Summary of the demographics and soil characteristics of the Rwanda long term soil health study.
Add in details and links on study methodology here.
library(knitr)
library(ggplot2)
library(stringr)
suppressMessages(library(dplyr))
library(sp)
suppressMessages(library(rgdal))
suppressMessages(library(dismo))
suppressMessages(library(stargazer))
library(leaflet)
library(XML)
suppressMessages(library(maptools))
library(automap)
suppressMessages(library(RStata))
suppressMessages(library(fields))
library(gstat)
library(htmltools)
suppressMessages(library(Matching))
library(reshape2)
options("RStata.StataVersion" = 12)
options("RStata.StataPath" = "/Applications/Stata/StataSE.app/Contents/MacOS/stata-se")
#chooseStataBin("/Applications/Stata/StataSE.app/Contents/MacOS/stata-se")
The working directory objects are from the original baseline analysis. I’ve updated this to pull from a stable working directory (5/4/17)
# wd <- "/Users/mlowes/drive/optimized_agronomy/soil/soil_health_study/data/rw baseline"
# dd <- paste(wd, "data", sep = "/")
# od <- paste(wd, "output", sep="/")
# md <- paste(wd, "maps", sep="/")
drive <- "~/drive/r_help/4_output/statistical_test_outputs"
od <- "output" # come back and update this later. I got about half way through?
#load data:
# This data is being drawn from the Soil lab repository. It has the baseline data with it
# d <- read.csv(paste(dd, "Rwanda_shs_commcare_soil_data_final.csv", sep="/"), stringsAsFactors=FALSE)
Replicate Alex’s and Emmanuel’s merge process using “Identifiers with SSN_final” provided by Emmanuel. I use the RStata package found here
# I'm replicating Alex's do file here.
stata("merge_shs.do")
. * merge raw commcare data with soil database
.
. * date: 11 july 2016
.
. clear all
. set more off
.
. * set directory
. global wd "/Users/mlowes/drive/soil health study/data/rw baseline"
. global dd "/Users/mlowes/drive/soil health study/data/rw baseline/data"
. global troubleshoot "/Users/mlowes/drive/soil health study/data/rw baseline/t
> roubleshoot"
.
. * insheet data
. insheet using "$dd/raw_rwanda_commcare_shs.csv", clear
file /Users/mlowes/drive/soil health study/data/rw
baseline/data/raw_rwanda_commcare_shs.csv not found
r(601);
.
. * clean sample_id variable
. replace sample_id = "" if sample_id == "---"
. replace sample_id = "-99" if sample_id == "99"
.
. destring sample_id, replace
.
. * manipulate current "sample_id" to become a proper alpha-numeric unique iden
> tifier
. * this simply involves adding "C" to each numeric code that belongs to a cont
> rol farmer
.
. * variables of interest:
. * d_client
. * sample_id
.
. ren demographic_infod_client d_client
. replace d_client = "" if d_client == "---"
. destring d_client, replace
.
. tostring sample_id, gen(sample_id_string)
.
. replace sample_id_string = sample_id_string + "C" if d_client == 0
. drop sample_id
. ren sample_id_string sample_id
.
. ***** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ***
> ***
. * quantify and note cases where sample_id appears more than twice
. * 39 codes appear 2x
. * 1 code appears 3x
. * 1 code appears 4x
. * 1 code appears 10x
. * 1 code appears 19x
.
. duplicates report sample_id
. duplicates tag sample_id, gen(n_duplicate_id)
. gen d_id_problem = 1 if n_duplicate_id != 0
. replace d_id_problem = 0 if missing(d_id_problem)
.
. ** need to investigate 5% of observations (114 samples), i.e. d_id_problem =
> 1**
. ***** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ***
> ***
.
. * outsheet problematic observations and examine 1-by-1
. sort sample_id
. outsheet demographicnom_enqueteur district demographic_infocellule_selected d
> emographic_infoumudugudu d_client demographic_infonom_cultivateur sample_id d
> emographic_infosex demographic_infoage_cultivateur demographic_infon_telephon
> e_cult d_id_problem using "$dd/shs_to_check.csv" if d_id_problem == 1, replac
> e comma
.
. **************** post-investigation correction of incorrect ids *************
> ***
.
. ren demographic_infonom_cultivateur farmer_name
. ren demographic_infocellule_selected selected_cell
. ren demographicnom_enqueteur enumerator_name
.
. * drop test/accidentally double-entered observations
. drop if infoformid == "f138897f-6e80-4b0a-9db3-ab2134cd9e51"
. drop if farmer_name == "Mugisha"
. drop if farmer_name == "Muhunde yoramu"
. drop if farmer_name == "Test"
. drop if farmer_name == "Billy"
. drop if farmer_name == "Gakuba michel"
. drop if farmer_name == "Renata"
. drop if farmer_name == "Bub"
. drop if farmer_name == "M"
. drop if farmer_name == "Anita"
. drop if farmer_name == "Jean"
. drop if farmer_name == "Nyirazaninka lea"
. drop if farmer_name == "Mugiraneza Pacifiue"
. drop if farmer_name == "Nsengiyaremye Innocent"
.
. * correct incorrectly recorded ids
. replace sample_id = "880C" if farmer_name == "Nyiransanzineza Marie Rose"
. replace sample_id = "811C" if farmer_name == "Ngirabatware Daniel"
. replace sample_id = "646C" if farmer_name == "Nyirasabwa anonciata"
. replace sample_id = "624C" if farmer_name == "Nyirinkindi j Bosco"
. replace sample_id = "560C" if farmer_name == "Mubirigi Bertin"
. replace d_client = 0 if farmer_name == "Mubirigi Bertin"
. replace sample_id = "375C" if farmer_name == "Hitimana ezecquiere"
. replace sample_id = "322C" if farmer_name == "Nyirantabire arrivera"
. replace sample_id = "2566C" if farmer_name == "MUSABYIMANA Colletta"
. replace sample_id = "2399C" if farmer_name == "Nyirabajyambere anastisia"
. replace d_client = 0 if farmer_name == "Nyirabajyambere anastisia"
. replace sample_id = "2280C" if farmer_name == "Zihabake Simon"
. replace sample_id = "2202C" if farmer_name == "mugiraneza pacifique"
. replace sample_id = "1780C" if farmer_name == "Hakuzimana Theophile"
. replace sample_id = "1757C" if farmer_name == "Bavugirije Feleciane"
. replace sample_id = "1626C" if farmer_name == "Nyabivumu"
. replace sample_id = "1575C" if farmer_name == "BAHIMBANDE Beriyana."
. replace d_client = 0 if farmer_name == "BAHIMBANDE Beriyana."
. replace sample_id = "1103C" if farmer_name == "Barirwanda Elyse"
. replace sample_id = "1037C" if farmer_name == "Mukashema Faraziya"
. replace d_client = 0 if farmer_name == "Mukashema Faraziya"
. replace sample_id = "3069" if farmer_name == "Ndababonye Silas"
. replace sample_id = "2611" if farmer_name == "Sebazungu modeste"
. replace d_client = 1 if farmer_name == "Sebazungu modeste"
. replace sample_id = "2415" if farmer_name == "Nkaka joseph"
. replace sample_id = "2412" if farmer_name == "Ndagijimana alexis"
. replace sample_id = "2410" if farmer_name == "Sebuhoro elie"
. replace sample_id = "2406" if farmer_name == "Mugisha ruth"
. replace sample_id = "2405" if farmer_name == "Mboneza hasan"
. replace sample_id = "2404" if farmer_name == "Ntibitonderwa veriane"
. replace sample_id = "2399" if farmer_name == "Nyiraburakeye feresita"
. replace sample_id = "2395" if farmer_name == "Uwimana danier"
. replace sample_id = "2394" if farmer_name == "Bizinde silas"
. replace sample_id = "2390" if farmer_name == "Nyirahabimvana keresesiya"
. replace sample_id = "2388" if farmer_name == "Nyirasekerabanzi tharcilla"
. replace sample_id = "2386" if farmer_name == "Ndererimana emmanuel"
. replace sample_id = "2291" if farmer_name == "Ndacyayisenga Feresiyani"
. replace sample_id = "2184" if farmer_name == "Muhawenimana Beretirida"
. replace sample_id = "2145" if farmer_name == "Fapfakwita celestine"
. replace sample_id = "2141" if farmer_name == "Nsangiranabo krizoroji"
. replace sample_id = "2140" if farmer_name == "Nsabyumuremyi anakereti"
. replace sample_id = "2136" if farmer_name == "Nyirantibitonderwa savoroniya"
. replace sample_id = "2131" if farmer_name == "Hagenimana Ewufroni"
. replace sample_id = "2066" if farmer_name == "Hitayezu vicent"
. replace d_client = 1 if farmer_name == "Hitayezu vicent"
. replace sample_id = "2044" if farmer_name == "Niyodushima janette"
. replace d_client = 1 if farmer_name == "Niyodushima janette"
. replace sample_id = "2024" if farmer_name == "Mukabatsinda veren A"
. replace sample_id = "2020" if farmer_name == "Sibomana innocent"
. replace sample_id = "2009" if farmer_name == "Nsabimana inyasi"
. replace sample_id = "2002" if farmer_name == "Mateso elizabet"
. replace sample_id = "1889" if farmer_name == "Niyonsaba Daniel"
. replace sample_id = "1864" if farmer_name == "Mukankubana Souzanne"
. replace sample_id = "1780" if farmer_name == "Nyirantihabose Annonciatha"
. replace sample_id = "1771" if farmer_name == "Bizimungu Damascne"
. replace sample_id = "1675" if farmer_name == "Bikorimana Isaac"
. replace sample_id = "1103" if farmer_name == "Munyabarata japhette"
. replace sample_id = "954" if farmer_name == "Hahirwabake silver"
. replace sample_id = "541" if farmer_name == "Rugemintwaza Vianney"
. replace sample_id = "454" if farmer_name == "Niyonizeye fororida"
. replace sample_id = "407" if farmer_name == "Munyaneza jean pierre"
. replace sample_id = "375" if farmer_name == "Munyarubuga nanias"
. replace sample_id = "322" if farmer_name == "Gacandaga zackarie"
. replace sample_id = "262" if farmer_name == "MUHIRWA J Damascene"
. replace sample_id = "32" if farmer_name == "Nsanzemuhire"
.
. *** re-run duplicates test
. drop d_id_problem
. drop n_duplicate_id
.
. duplicates report sample_id
. duplicates tag sample_id, gen(n_duplicate_id)
. gen d_id_problem = 1 if n_duplicate_id != 0
. replace d_id_problem = 0 if missing(d_id_problem)
.
. * second try: outsheet problematic observations and examine 1-by-1
. sort sample_id
. outsheet using "$dd/shs_to_check_2.csv" if d_id_problem == 1, replace comma
.
. * second try: correct incorrect ids
.
. replace sample_id = "2202" if farmer_name == "Mukamurara scholastique"
. replace d_client = 1 if farmer_name == "Mukamurara scholastique"
. replace sample_id = "2195" if farmer_name == "Ndacyayisenga Feresiyani"
. replace sample_id = "824" if farmer_name == "Ngirabatware Daniel"
.
. replace sample_id = "" if rownumber == 238
. replace sample_id = "" if rownumber == 848
. replace sample_id = "" if rownumber == 1257
. replace sample_id = "" if rownumber == 1732
.
. *** re-run duplicates test
. drop d_id_problem
. drop n_duplicate_id
.
. duplicates report sample_id
. duplicates tag sample_id, gen(n_duplicate_id)
. gen d_id_problem = 1 if n_duplicate_id != 0
. replace d_id_problem = 0 if missing(d_id_problem)
.
. * third try: outsheet problematic observations
. sort sample_id
. outsheet using "$dd/shs_to_check_3.csv" if d_id_problem == 1, replace comma
.
. *** 4 sets of duplicates remain (2 observations)
. drop if d_id_problem == 1
. save "$dd/rwanda_commcare_shs_clean.dta", replace
. outsheet using "$dd/rwanda_commcare_shs_clean.csv", comma replace
.
. * merge datasets
. insheet using "$dd/mne_rwanda_database_shs.csv", clear
.
. save "$dd/mne_rwanda_database_shs.dta", replace
.
. use "$dd/rwanda_commcare_shs_clean.dta", clear
.
. merge 1:1 sample_id using "$dd/mne_rwanda_database_shs.dta"
.
. * 93 observations unmatched... :(
.
. * investigate unmatched observations
. sort sample_id
. outsheet using "$troubleshoot/unmatched_commcare.csv" if _merge == 1, comma r
> eplace
. outsheet using "$troubleshoot/unmatched_inventory.csv" if _merge == 2, comma
> replace
.
. * make first round of corrections in master data
. use "$dd/rwanda_commcare_shs_clean.dta", clear
.
. replace sample_id = "2711C" if farmer_name == "Kamuhanda Elie"
. replace sample_id = "260C" if farmer_name == "HATEGEKIMANA Mathias"
. replace sample_id = "245" if farmer_name == "Kayiranga Michel"
. replace sample_id = "1543" if farmer_name == "Ndagijimana Eliezel"
. replace sample_id = "1543C" if farmer_name == "Mukantanganzwa odeta"
. replace sample_id = "1632" if farmer_name == "Nyiranziguye Cecile"
. replace sample_id = "2782" if farmer_name == "Mukamungu leocadie"
. replace sample_id = "421C" if farmer_name == "Niyonizeye Francine"
. replace sample_id = "828C" if farmer_name == "Mukamazimpaka Terese"
. replace sample_id = "364" if farmer_name == "Mukamuyango verina"
. replace sample_id = "364C" if farmer_name == "Mujawimana jeanne"
. replace sample_id = "368" if farmer_name == "Nyiramanzi jean damascene"
. replace sample_id = "368C" if farmer_name == "Karwiyegura rachel"
. replace sample_id = "391" if farmer_name == "Mugemane augistin"
. replace sample_id = "391C" if farmer_name == "Ngirishuti sumayire"
. replace sample_id = "395" if farmer_name == "Niyomugabo amiel"
. replace sample_id = "395C" if farmer_name == "Ntawiniga augustin"
. replace sample_id = "396" if farmer_name == "Mpayimana philippe"
. replace sample_id = "329" if farmer_name == "Mucyezangango emmanuel"
. replace sample_id = "411" if farmer_name == "Nsabimana mathias"
. replace sample_id = "411C" if farmer_name == "Singirankabo"
. replace sample_id = "414" if farmer_name == "Rutaharama eldephonse"
. replace sample_id = "329C" if farmer_name == "Mukamana josephine"
. replace sample_id = "657" if farmer_name == "Nyandwi vincent"
. replace sample_id = "569C" if farmer_name == "Uwimana Bercilla"
. replace sample_id = "2051C" if farmer_name == "Ntambabazi Anastase"
. replace sample_id = "2249" if farmer_name == "Uwineza valentine"
. replace sample_id = "2360C" if farmer_name == "Bamporineza j deDieu"
. replace sample_id = "2001" if farmer_name == "Nyiramakuba thanene"
. replace sample_id = "2897C" if farmer_name == "Mukagatanazi Marianne"
. replace sample_id = "2965C" if farmer_name == "Twizeyimana Theoneste"
.
. drop if farmer_name == "Yjk"
. drop if farmer_name == "Uwambajimana Agnes"
. drop if farmer_name == "Tr"
.
. replace sample_id = "2415C" if farmer_name == "Nyiramahirwe Frolance"
. replace sample_id = "851" if farmer_name == "Bigirimana Amon"
.
.
. * duplicates check
. drop d_id_problem
. drop n_duplicate_id
.
. duplicates report sample_id
. duplicates tag sample_id, gen(n_duplicate_id)
. gen d_id_problem = 1 if n_duplicate_id != 0
. replace d_id_problem = 0 if missing(d_id_problem)
.
. * second try: merge
. merge 1:1 sample_id using "$dd/mne_rwanda_database_shs.dta"
.
. * deal with 30 unmatched cases
. sort sample_id
. outsheet using "$troubleshoot/unmatched_commcare_2.csv" if _merge == 1, comma
> replace
. outsheet using "$troubleshoot/unmatched_inventory_2.csv" if _merge == 2, comm
> a replace
.
. * 5 submitted surveys with no corresponding sample
. *drop if _merge == 1
.
. * 19 samples with no corresponding survey
. *drop if _merge == 2
.
. * outsheet merged database
. outsheet using "$dd/rwanda_shs_baseline_data.csv", comma replace
. * THIS IS THE CLEAN VERSION! ONLY USE THIS! DON'T USE OTHER THINGS!
.
Import the results of Alex’s do file.
d <- read.csv("data/rwanda_shs_baseline_data.csv", stringsAsFactors = F)
Identifiers <- read.csv("data/Identifiers with SSN_final.csv", stringsAsFactors = F)
wetChem <- read.csv(paste("/Users/mlowes/drive/jupyter/Rwanda Acidity", "Original chem_rwanda_shs.csv", sep = "/"))
d <- left_join(d, Identifiers, by="sample_id")
# now import the soil results and merge with survey data
soilDir <- normalizePath(file.path("..", "..", "OAF Soil Lab Folder", "Projects", "rw_shs_baseline", "4_predicted", "other_summaries"))
soilRF <- read.csv(file=paste(soilDir, "combined-predictions-including-bad-ones.csv", sep="/"))
orgSoilNames <- read.csv("data/rwshs_rfresults.csv", stringsAsFactors = F) #old from original variable work
names(orgSoilNames)[names(orgSoilNames)=="X"] <- "SSN"
Match new soil model outputs to variable names from original analysis in 2016 and join soil variables with survey data
soil <- soilRF %>% dplyr::select(SSN, Calcium, Magnesium, pH, Total.Nitrogen, Organic.Carbon, Exchangeable.Aluminium) %>%
rename(
m3.Ca = Calcium,
m3.Mg = Magnesium,
Total.N = Total.Nitrogen,
Total.C = Organic.Carbon,
ExAl = Exchangeable.Aluminium
) %>% mutate(
SSN = as.character(SSN)
)
d <- left_join(d, soil, by="SSN")
Now let’s start cleaning the demographic variables
d[d=="---"] <- NA
names(d) <- gsub("text_final_questions", "", names(d))
names(d) <- gsub("intro_champ_echantillon", "", names(d))
names(d) <- gsub("demographic_info", "", names(d))
names(d) <- gsub("other_inputs_", "", names(d))
names(d) <- gsub("crop1_15b_inputs", "", names(d))
names(d) <- gsub("crop2_15b_inputs", "", names(d))
names(d) <- gsub("^15b", "", names(d))
names(d) <- gsub("historical_intro", "", names(d))
names(d)[names(d)=="field_dim"] <- "field_dim1"
names(d)[names(d)=="v51"] <- "field_dim2"
Address unusual field sizes
ggplot(d, aes(x=field_dim1, y=field_dim2)) + geom_point()
It seems really unlikely that fields are 200 meters long while only being 20 meters wide. I don’t know how to check this though.
# clean field dimensions here - winsor the values to something reasonable.
d[(d$field_dim1>=100 | d$field_dim2>=100) & !is.na(d$field_dim1), c("field_dim1", "field_dim2")]
Take care of demographic data formatting issues
# deal with names and drop unnecessary variables
d <- d %>%
dplyr::select(-c(rownumber, infoformid, introductiond_accept, photo,
infocompleted_time,
enumerator_name, contains("phone"), farmer_name, farmersurname, farmername,
d_respondent, additionalsamplepackedandsenttol, additionalsamplerequestedfromlab,
datedryingcompleteifnecessary, driedindistrictifnecessary, senttohqyo,
collectedindistrictyo, excessstoredathq_, receivedathq_,dateofinitialdryingifnecessary,
samplecollectedinfieldyo, field_des, samplewetordry)) %>%
rename(
female = sex,
age = age_cultivateur,
own = d_own,
client = d_client) %>%
mutate(
female= ifelse(female=="gore", 1,0),
field.size = field_dim1*field_dim2
)
d$total.seasons <- apply(d[, grep("d_season_list", names(d))], 1, function(x) {
sum(x, na.rm=T)})
Clean agronomic practice variables
agVars <- c("n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow",
"n_seasons_leg_1", "n_seasons_leg_2")
summary(d[,agVars])
n_season_fert n_season_compost n_season_lime n_season_fallow n_seasons_leg_1 n_seasons_leg_2
Min. : 0.000 Min. : 0.000 Min. : 0.0000 Min. : 0.0000 Min. : 0.000 Min. : -88.000
1st Qu.: 0.000 1st Qu.: 2.000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.000
Median : 1.000 Median : 5.000 Median : 0.0000 Median : 0.0000 Median : 1.000 Median : 2.000
Mean : 2.041 Mean : 5.651 Mean : 0.1819 Mean : 0.6355 Mean : 2.171 Mean : 4.368
3rd Qu.: 3.000 3rd Qu.:10.000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 4.000 3rd Qu.: 5.000
Max. :10.000 Max. :10.000 Max. :10.0000 Max. :10.0000 Max. :10.000 Max. :3333.000
NA's :19 NA's :19 NA's :19 NA's :19 NA's :19 NA's :19
Sort out the legumes as a second crop
table(d$n_seasons_leg_2, useNA = 'ifany')
-88 0 1 2 3 4 5 6 7 8 9 10 15 22 50 53 83 88 101 102 3333 <NA>
1 956 221 258 174 130 301 61 34 66 57 199 1 1 1 3 1 1 1 1 1 19
d$n_seasons_leg_2 <- ifelse(d$n_seasons_leg_2 <0 | d$n_seasons_leg_2>10, NA, d$n_seasons_leg_2)
table(d$client, useNA = 'ifany')
0 1 <NA>
1233 1236 19
d[is.na(d$client), c("sample_id")]
[1] "1189C" "1207C" "1295" "1295C" "1298C" "1300" "1300C" "1366C" "1476C" "1485C" "1554" "1573" "1614" "2042" "2371C" "2373" "2375"
[18] "2382" "2442"
# replace client based on whether there is a C in the client variable.
d$client.check <- ifelse(grepl("C", d$sample_id)==T, 0, 1)
table(d$client, d$client.check)
0 1
0 1233 0
1 1 1235
It looks like most farmers were recorded correctly except for one farmer who was coded as Tubura farmer but their sample_id indicate their a control. Let’s take a look:
d[d$client==1 & d$client.check==0 & !is.na(d$d_gps), c("sample_id", "tuburacontroltc")]
# they should be a control
d[d$client==1 & d$client.check==0 & !is.na(d$d_gps), "client"] <- 0
# remove farmers for which we have soil data but no survey data (using)
d <- d[-which(grepl("using", d$X_merge)),]
table(d$client, d$client.check, useNA = 'ifany')
0 1
0 1234 0
1 0 1235
# update client to equal client check
d$client <- d$client.check
Fix some more variable names:
names(d)[names(d)=="field_kg_fert1_1"] <- "field_kg_fert1_15b"
names(d)[names(d)=="field_kg_fert2_1"] <- "field_kg_fert2_15b"
names(d)[names(d)=="field_kg_compost"] <- "field_kg_compost_15b"
Recode variables to numeric:
# recode to numeric
varlist <- c("client", "own", "crop1_15b_seedkg", "crop1_15b_yield", "crop1_15b_yield_",
"crop2_15b_seedkg", "crop2_15b_yield", "crop2_15b_yield_", "field_kg_fert1_15b",
"field_kg_fert2_15b", "field_kg_compost_15b", "d_lime_15b", "kg_lime_15b")
# check that there aren't values hidden in the character variables
#apply(d[,varlist], 2, function(x){table(x, useNA='ifany')})
# recode characters to numerics
d[, varlist] <- sapply(d[,varlist], as.numeric)
table(d$kg_lime_15b, useNA = 'ifany')
-88 1 3 4 5 10 13 15 20 25 30 35 50 60 88 100 150 200 <NA>
1 2 2 2 6 5 1 1 2 5 1 1 8 1 2 2 4 1 2422
d$kg_lime_15b <- ifelse(abs(d$kg_lime_15b)==88, NA, d$kg_lime_15b)
# divide out GPS coordinates
# http://rfunction.com/archives/1499
# replace the blank gps_pic_guide with info
d <- cbind(d, str_split_fixed(d$gps_pic_guid, " ", n=4))
names(d)[names(d)=="1" |names(d)== "2" | names(d)== "3" | names(d)== "4"] <- c("lat", "lon", "alt", "precision")
d[,c("lat", "lon", "alt", "precision")] <- sapply(d[,c("lat", "lon", "alt", "precision")],
function(x){as.numeric(as.character(x))})
Cleaning of soil data: Come back, check and clean the soil data before outputting to clean data set. Plot each of the soil variables to look for unrealistic values.
dim(d[is.na(d$m3.Ca),])
[1] 30 111
d <- d[-which(is.na(d$m3.Ca)),]
summary(d[,c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C", "ExAl")])
m3.Ca m3.Mg pH Total.N Total.C ExAl
Min. : 90.8 Min. : 39.45 Min. :3.840 Min. :0.0600 Min. :0.8749 Min. :0.01578
1st Qu.: 358.0 1st Qu.: 121.32 1st Qu.:4.995 1st Qu.:0.1300 1st Qu.:1.7290 1st Qu.:0.13532
Median : 620.5 Median : 184.74 Median :5.470 Median :0.1500 Median :2.0301 Median :0.30884
Mean : 821.3 Mean : 209.82 Mean :5.483 Mean :0.1521 Mean :2.0791 Mean :0.59145
3rd Qu.:1102.5 3rd Qu.: 262.09 3rd Qu.:5.950 3rd Qu.:0.1700 3rd Qu.:2.3466 3rd Qu.:0.93700
Max. :6090.4 Max. :1052.13 Max. :8.140 Max. :0.3500 Max. :4.0658 Max. :2.50849
names(d)[names(d)=="general_field_infosoil_type"] <- "soil_type"
names(d)[names(d)=="general_field_infotexture_soil"] <- "soil_texture"
d$black_soil <- ifelse(d$soil_type=="black", 1,0)
d$red_soil <- ifelse(d$soil_type=="red", 1,0)
d$sandy_soil <-ifelse(d$soil_type=="sandy", 1,0)
Let’s check out the rows for which we don’t have soil data and drop them as they won’t contribute to the full picture.
soilVars <- c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C", "ExAl")
for(i in 1:length(soilVars)){
print(
ggplot(data=d, aes(x=as.factor(client), y=d[,soilVars[i]])) +
geom_boxplot() +
labs(x="Tubura Farmer", y=soilVars[i], title = paste("RW baseline soil - ", soilVars[i], sep = ""))
)
}
There are biologically predictable relationships between soil chemical characteristics. For instance, we expect Ca and Mg to move in the same direction and be positively correlated with pH. If we had Aluminum as an outcome, we’d expect pH to be negatively correlated with soluable aluminum. Let’s look quickly to confirm if those relationships are present:
ggplot(d, aes(x=m3.Ca, y=m3.Mg)) + geom_point() +
stat_smooth(method="loess") +
labs(x = "Calcium (m3)", y= "Magnesium (m3)", title="Calcium and Magnesium relationship")
ggplot(d, aes(x=pH, y=m3.Ca)) + geom_point() +
stat_smooth(method="loess") +
labs(x = "pH", y="Calcium (m3)", title = "pH and Calcium relationship")
ggplot(d, aes(x=pH, y=m3.Mg)) + geom_point() +
stat_smooth(method="loess") +
labs(x = "pH", y="Magnesium (m3)", title = "pH and Magnesium relationship")
ggplot(d, aes(x=pH, y=ExAl)) + geom_point() +
stat_smooth(method="loess") +
labs(x = "pH", y="Exchangeable Aluminum", title = "pH and Aluminum relationship")
ggplot(d, aes(x=Total.C, y=Total.N)) + geom_point() +
stat_smooth(method="loess") +
labs(x = "Total Carbon", y="Total Nitrogen", title = "Carbon and Nitrogen relationship")
The soil characteristics are moving in the manner that is consistent with our understanding of soil chemical processes.
Save clean demographic and soil data to external file
write.csv(d, file="data/shs rw baseline.csv")
save(d, file="data/shs rw baseline.Rdata")
Produce a simple map of where our observations are
See here for more on using markerClusterOptions in leaflet.
In the map below, the larger green circles are Tubura farmers and the smaller blue circles are control farmers.
e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e)
pal <- colorNumeric(c("navy", "green"), domain=unique(ss$client))
map <- leaflet() %>% addTiles() %>%
setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
addCircleMarkers(lng=ss$lon, lat=ss$lat,
radius= ifelse(ss$client==1, 10,6),
color = pal(ss$client),
clusterOptions = markerClusterOptions(disableClusteringAtZoom=13, spiderfyOnMaxZoom=FALSE))
map
count <- d %>% group_by(district) %>%
dplyr::summarize(
t.count = sum(ifelse(client==1,1,0)),
c.count = sum(ifelse(client==0,1,0)),
total = n()
) %>% ungroup()
count <- as.data.frame(count)
write.csv(count, file="data/final rw sample breakdown.csv", row.names=F)
#as.data.frame(count)
Let’s see how balanced our farmers are in terms of demographic variables. Tubura farmers were selected based on (list criteria) and control farmers in the same area tha fit the same criteria were also selected. No matching process has been performed to identify the control farmers that most closely resemble the Tubura farmers in the sample. These results are entirely reflecting the balance inherent in the identification process, not any statistical matching of treatment and control.
d$valley <- ifelse(d$general_field_infofield_location=="valley", 1,0)
d$hillside <- ifelse(d$general_field_infofield_location=="hillside", 1,0)
d$hilltop <- ifelse(d$general_field_infofield_location=="hilltop", 1,0)
names(d)[names(d)=="general_field_infograde_hill"] <- "slope"
cor(d[, grep("betail_", names(d))], use='complete.obs')
betail_ownedn_inka betail_ownedn_ihene betail_ownedn_inkoko betail_ownedn_ingurube betail_ownedn_intama
betail_ownedn_inka 1.00000000 0.035195326 0.09700986 0.021287801 0.040572776
betail_ownedn_ihene 0.03519533 1.000000000 0.12626563 -0.009531107 -0.002990226
betail_ownedn_inkoko 0.09700986 0.126265627 1.00000000 0.049825561 0.025680156
betail_ownedn_ingurube 0.02128780 -0.009531107 0.04982556 1.000000000 0.020300946
betail_ownedn_intama 0.04057278 -0.002990226 0.02568016 0.020300946 1.000000000
names(d)[grep("betail_", names(d))] <- c("cows", "goats", "chickens", "pigs", "sheep")
d$own.cows <- ifelse(d$cows>0 & !is.na(d$cows), 1,0)
d$own.goats <- ifelse(d$goats>0 & !is.na(d$goats), 1,0)
d$own.chickens <- ifelse(d$chickens>0 & !is.na(d$chickens), 1,0)
d$own.pigs <- ifelse(d$pigs>0 & !is.na(d$pigs), 1,0)
d$own.sheep <- ifelse(d$sheep>0 & !is.na(d$sheep), 1,0)
names(d)[names(d)=="inputuse_15b_priorculture_15b_1"] <-"crop_15b"
d$crop_15b <- tolower(d$crop_15b)
sort(prop.table(table(d$crop_15b, useNA = 'ifany')))
kor hwag sheke coffee cer uburo teke insina shaz nyo gan
0.0004100041 0.0008200082 0.0008200082 0.0012300123 0.0024600246 0.0036900369 0.0041000410 0.0057400574 0.0086100861 0.0131201312 0.0196801968
other_veg ray yum gor ntacyo soya jum big saka shy
0.0209102091 0.0524805248 0.0586305863 0.0594505945 0.0594505945 0.0791307913 0.0914309143 0.1599015990 0.1648216482 0.1931119311
d$climbing_bean <- ifelse(d$crop_15b=="shy", 1,0)
d$sorghum <- ifelse(d$crop_15b=="saka", 1,0)
d$bush_bean <- ifelse(d$crop_15b=="big", 1,0)
d$maize <- ifelse(d$crop_15b=="gor", 1,0)
out.list <- c("female", "age", "hhsize", "own", "field.size",
"n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow", "n_seasons_leg_1", "n_seasons_leg_2", "slope", "alt", "valley", "hillside", "hilltop", "cows", "goats", "chickens", "pigs", "sheep", "climbing_bean", "sorghum", "bush_bean","maize", soilVars, "own.cows", "own.goats", "own.chickens", "own.pigs", "own.sheep", "black_soil", "red_soil", "sandy_soil")
output <- do.call(rbind, lapply(out.list, function(x) {
out <- t.test(d[,x] ~ d[,"client"], data=d)
tab <- data.frame(out[[5]][[2]],out[[5]][[1]], out[3])
tab[,1:2] <- round(tab[,1:2],3)
names(tab) <- c(names(out[[5]]), "pvalue")
return(tab)
}))
# use p.adjust with bonferroni correction
output$pvalue <- p.adjust(output$pvalue, method="fdr")
rownames(output) <- out.list
output <- output[order(output$pvalue),]
output$pvalue <- ifelse(output[, 3] < 0.001, "< 0.001", round(output[, 3], 3))
colnames(output) <- c("Tubura Client","Non-Tubura", "p-value")
print(kable(output))
| Tubura Client | Non-Tubura | p-value | |
|---|---|---|---|
| n_season_fert | 3.270 | 0.835 | < 0.001 |
| female | 0.494 | 0.649 | < 0.001 |
| own.cows | 0.681 | 0.542 | < 0.001 |
| age | 44.020 | 48.070 | < 0.001 |
| hhsize | 5.418 | 4.897 | < 0.001 |
| n_seasons_leg_2 | 2.533 | 3.086 | < 0.001 |
| own.chickens | 0.340 | 0.267 | < 0.001 |
| own.pigs | 0.355 | 0.281 | < 0.001 |
| n_season_lime | 0.227 | 0.132 | < 0.001 |
| pigs | 0.525 | 0.394 | 0.006 |
| chickens | 1.217 | 0.898 | 0.01 |
| hillside | 0.682 | 0.733 | 0.02 |
| own | 0.931 | 0.956 | 0.027 |
| maize | 0.072 | 0.047 | 0.027 |
| valley | 0.267 | 0.224 | 0.03 |
| cows | 1.174 | 0.919 | 0.03 |
| sorghum | 0.146 | 0.184 | 0.03 |
| sheep | 0.212 | 0.144 | 0.041 |
| climbing_bean | 0.212 | 0.174 | 0.041 |
| own.sheep | 0.105 | 0.078 | 0.046 |
| n_season_fallow | 0.699 | 0.566 | 0.083 |
| n_season_compost | 5.803 | 5.524 | 0.124 |
| m3.Ca | 798.870 | 843.860 | 0.143 |
| pH | 5.464 | 5.502 | 0.209 |
| field.size | 667.682 | 606.312 | 0.29 |
| ExAl | 0.604 | 0.578 | 0.404 |
| n_seasons_leg_1 | 2.220 | 2.133 | 0.568 |
| hilltop | 0.051 | 0.044 | 0.568 |
| bush_bean | 0.154 | 0.166 | 0.568 |
| m3.Mg | 207.955 | 211.689 | 0.568 |
| Total.C | 2.072 | 2.087 | 0.568 |
| slope | 10.236 | 10.460 | 0.615 |
| Total.N | 0.152 | 0.153 | 0.615 |
| sandy_soil | 0.144 | 0.149 | 0.823 |
| black_soil | 0.537 | 0.543 | 0.835 |
| alt | 1570.722 | 1565.840 | 0.841 |
| goats | 1.074 | 1.057 | 0.841 |
| red_soil | 0.280 | 0.276 | 0.841 |
| own.goats | 0.483 | 0.486 | 0.892 |
Pre-PSM table 1 summary
table1vars <- c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg")
table1 <- output[rownames(output) %in% table1vars, ]
write.csv(table1, file=paste("output","rw table1.csv", sep = "/"), row.names=T)
#write table
write.csv(output, file=paste("output", "baseline balance.csv", sep="/"), row.names=T)
# write baseline balance table for ES per his table layouts
t4order <- c("age", "female", "hhsize", "own")
table4vars <- paste(t4order, collapse="|")
rw.table4 <- output[grepl(table4vars, rownames(output)),]
rw.table4 <- rw.table4[order(match(rownames(rw.table4), t4order)),]
write.csv(rw.table4, file=paste("output", "pre match balance table4.csv", sep="/"),
row.names = T)
t5order <- c("field.size", "slope", "alt", "hilltop", "hillside", "valley",
"climbing_bean", "sorghum", "bush_bean", "maize")
table5vars <- paste(t5order,collapse="|")
rw.table5 <- output[grepl(table5vars, rownames(output)),]
rw.table5 <- rw.table5[order(match(rownames(rw.table5), t5order)),]
write.csv(rw.table5, file=paste("output", "pre match balance table5.csv", sep = "/"),
row.names = T)
t6order <- c("own.cows", "cows", "own.pigs", "pigs", "own.sheep", "sheep", "own.goats", "goats","own.chickens", "chickens")
table6vars <- paste(t6order, collapse = "|")
rw.table6 <- output[grep(table6vars, rownames(output)), ]
rw.table6 <- rw.table6[order(match(rownames(rw.table6), t6order)),]
write.csv(rw.table6, file=paste("output", "pre match balance table6.csv", sep = "/"),
row.names=T)
Demographic variables We are not well balanced along the main demographic variables we collected, sex, age and HH size. For the purposes of inference we can test some matching algorithms to improve the match between Tubura and control farmers.
Agricultural practice variables We are decently balanced along agricultural practice variables. Our course of action here is similiar to our options with the demographic variables.
Soil Variables We are balanced on the primary soil variables of interest betwen our Tubura farmers and the comparison farmers.
dist.output <- do.call(rbind, lapply(split(d, d$district), function(x) {
tab <- do.call(rbind, lapply(out.list, function(y) {
out <- t.test(x[,y] ~ x[,"client"], data=x)
tab <- data.frame(out[[5]][[2]],out[[5]][[1]], out[3])
tab[,1:2] <- round(tab[,1:2],3)
names(tab) <- c(names(out[[5]]), "pvalue")
#tab[,3] <- p.adjust(tab[,3], method="holm")
#tab[,3] <- ifelse(tab[,3] < 0.001, "< 0.001", round(tab[,3],3))
#print(tab)
return(tab)
}))
return(data.frame(district = unique(x$district), tab))
}))
rownames(dist.output) <- NULL
dist.output$variable <- rep(out.list,length(unique(d$district)))
# order variables
dist.output <- dist.output[, c(1, 5, 2:4)]
dist.output$pvalue <- p.adjust(dist.output$pvalue, method="fdr")
dist.output <- dist.output[order(dist.output$pvalue),]
dist.output$pvalue <- ifelse(dist.output$pvalue < 0.001, "< 0.001", round(dist.output$pvalue,3))
colnames(dist.output) <- c("District", "Varible", "Tubura Client","Non-Tubura", "p-value")
print(kable(dist.output))
| District | Varible | Tubura Client | Non-Tubura | p-value | |
|---|---|---|---|---|---|
| 162 | Karongi | n_season_fert | 3.809 | 0.449 | < 0.001 |
| 240 | Mugonero | n_season_fert | 4.313 | 0.592 | < 0.001 |
| 435 | Rutsiro | n_season_fert | 4.200 | 1.072 | < 0.001 |
| 318 | Nyamasheke | n_season_fert | 5.000 | 1.500 | < 0.001 |
| 474 | Rwamagana | n_season_fert | 2.723 | 1.064 | < 0.001 |
| 123 | Huye | n_season_fert | 2.268 | 0.322 | < 0.001 |
| 461 | Rutsiro | own.cows | 0.824 | 0.542 | < 0.001 |
| 45 | Gatsibo_NLWH | n_season_fert | 1.500 | 0.296 | < 0.001 |
| 279 | Nyamagabe | n_season_fert | 4.000 | 1.582 | < 0.001 |
| 132 | Huye | hillside | 0.696 | 0.922 | < 0.001 |
| 131 | Huye | valley | 0.259 | 0.052 | < 0.001 |
| 320 | Nyamasheke | n_season_lime | 0.322 | 0.041 | < 0.001 |
| 201 | Kayonza | n_season_fert | 2.093 | 0.776 | 0.001 |
| 470 | Rwamagana | age | 44.009 | 51.882 | 0.002 |
| 157 | Karongi | female | 0.463 | 0.671 | 0.005 |
| 393 | Nyaruguru | hhsize | 6.217 | 4.717 | 0.008 |
| 110 | Gisagara | own.cows | 0.696 | 0.326 | 0.008 |
| 315 | Nyamasheke | hhsize | 5.698 | 4.719 | 0.008 |
| 112 | Gisagara | own.chickens | 0.370 | 0.065 | 0.009 |
| 113 | Gisagara | own.pigs | 0.565 | 0.217 | 0.012 |
| 95 | Gisagara | cows | 1.043 | 0.413 | 0.018 |
| 119 | Huye | age | 43.491 | 49.400 | 0.021 |
| 274 | Nyamagabe | female | 0.482 | 0.764 | 0.038 |
| 337 | Nyamasheke | maize | 0.081 | 0.007 | 0.038 |
| 352 | Nyanza | female | 0.234 | 0.543 | 0.038 |
| 357 | Nyanza | n_season_fert | 1.298 | 0.152 | 0.038 |
| 97 | Gisagara | chickens | 1.196 | 0.174 | 0.039 |
| 2 | Gatsibo_LWH | age | 41.931 | 50.138 | 0.042 |
| 41 | Gatsibo_NLWH | age | 40.023 | 46.210 | 0.042 |
| 40 | Gatsibo_NLWH | female | 0.326 | 0.556 | 0.044 |
| 502 | Rwamagana | own.chickens | 0.411 | 0.227 | 0.052 |
| 242 | Mugonero | n_season_lime | 0.176 | 0.023 | 0.06 |
| 43 | Gatsibo_NLWH | own | 0.907 | 1.000 | 0.061 |
| 469 | Rwamagana | female | 0.554 | 0.736 | 0.063 |
| 196 | Kayonza | female | 0.407 | 0.672 | 0.067 |
| 149 | Huye | own.cows | 0.670 | 0.487 | 0.071 |
| 237 | Mugonero | hhsize | 5.802 | 5.031 | 0.074 |
| 432 | Rutsiro | hhsize | 5.624 | 4.867 | 0.08 |
| 1 | Gatsibo_LWH | female | 0.241 | 0.483 | 0.084 |
| 181 | Karongi | maize | 0.086 | 0.019 | 0.084 |
| 236 | Mugonero | age | 44.290 | 49.331 | 0.091 |
| 305 | Nyamagabe | own.cows | 0.732 | 0.491 | 0.105 |
| 319 | Nyamasheke | n_season_compost | 6.517 | 5.418 | 0.106 |
| 365 | Nyanza | valley | 0.298 | 0.087 | 0.107 |
| 366 | Nyanza | hillside | 0.702 | 0.913 | 0.107 |
| 334 | Nyamasheke | climbing_bean | 0.416 | 0.274 | 0.109 |
| 310 | Nyamagabe | black_soil | 0.446 | 0.218 | 0.11 |
| 62 | Gatsibo_NLWH | sorghum | 0.279 | 0.469 | 0.116 |
| 391 | Nyaruguru | female | 0.391 | 0.652 | 0.121 |
| 437 | Rutsiro | n_season_lime | 0.327 | 0.096 | 0.138 |
| 134 | Huye | cows | 1.161 | 0.774 | 0.144 |
| 224 | Kayonza | Total.N | 0.179 | 0.194 | 0.15 |
| 256 | Mugonero | climbing_bean | 0.321 | 0.192 | 0.159 |
| 363 | Nyanza | slope | 7.362 | 9.239 | 0.159 |
| 491 | Rwamagana | sorghum | 0.188 | 0.327 | 0.159 |
| 396 | Nyaruguru | n_season_fert | 2.130 | 1.065 | 0.174 |
| 81 | Gisagara | hhsize | 5.348 | 4.326 | 0.179 |
| 98 | Gisagara | pigs | 0.609 | 0.304 | 0.179 |
| 137 | Huye | pigs | 0.964 | 0.748 | 0.179 |
| 152 | Huye | own.pigs | 0.759 | 0.617 | 0.179 |
| 324 | Nyamasheke | slope | 14.315 | 17.185 | 0.179 |
| 426 | Nyaruguru | own.sheep | 0.109 | 0.000 | 0.19 |
| 276 | Nyamagabe | hhsize | 5.482 | 4.673 | 0.206 |
| 6 | Gatsibo_LWH | n_season_fert | 2.552 | 1.500 | 0.217 |
| 450 | Rutsiro | sheep | 0.467 | 0.247 | 0.217 |
| 120 | Huye | hhsize | 5.134 | 4.574 | 0.223 |
| 140 | Huye | sorghum | 0.241 | 0.374 | 0.223 |
| 500 | Rwamagana | own.cows | 0.545 | 0.400 | 0.227 |
| 245 | Mugonero | n_seasons_leg_2 | 3.131 | 4.077 | 0.231 |
| 128 | Huye | n_seasons_leg_2 | 3.732 | 4.860 | 0.243 |
| 50 | Gatsibo_NLWH | n_seasons_leg_2 | 2.977 | 3.889 | 0.244 |
| 292 | Nyamagabe | chickens | 0.857 | 0.309 | 0.244 |
| 463 | Rutsiro | own.chickens | 0.291 | 0.193 | 0.254 |
| 87 | Gisagara | n_season_fallow | 0.500 | 0.087 | 0.255 |
| 89 | Gisagara | n_seasons_leg_2 | 2.370 | 3.283 | 0.255 |
| 191 | Karongi | own.pigs | 0.401 | 0.291 | 0.255 |
| 307 | Nyamagabe | own.chickens | 0.286 | 0.127 | 0.255 |
| 457 | Rutsiro | pH | 5.275 | 5.396 | 0.255 |
| 59 | Gatsibo_NLWH | pigs | 0.244 | 0.062 | 0.262 |
| 207 | Kayonza | slope | 3.296 | 1.690 | 0.275 |
| 322 | Nyamasheke | n_seasons_leg_1 | 2.309 | 1.733 | 0.275 |
| 355 | Nyanza | own | 0.915 | 1.000 | 0.275 |
| 441 | Rutsiro | slope | 15.521 | 14.030 | 0.276 |
| 362 | Nyanza | n_seasons_leg_2 | 2.085 | 3.478 | 0.277 |
| 472 | Rwamagana | own | 0.911 | 0.973 | 0.285 |
| 445 | Rutsiro | hilltop | 0.097 | 0.042 | 0.293 |
| 84 | Gisagara | n_season_fert | 1.087 | 0.348 | 0.3 |
| 420 | Nyaruguru | Total.C | 2.056 | 2.176 | 0.3 |
| 164 | Karongi | n_season_lime | 0.099 | 0.013 | 0.31 |
| 275 | Nyamagabe | age | 43.768 | 49.018 | 0.31 |
| 485 | Rwamagana | cows | 1.062 | 0.518 | 0.31 |
| 108 | Gisagara | Total.C | 1.836 | 1.967 | 0.316 |
| 225 | Kayonza | Total.C | 2.355 | 2.520 | 0.316 |
| 411 | Nyaruguru | sheep | 0.196 | 0.000 | 0.316 |
| 96 | Gisagara | goats | 1.522 | 0.935 | 0.319 |
| 477 | Rwamagana | n_season_fallow | 1.214 | 0.718 | 0.319 |
| 382 | Nyanza | ExAl | 0.168 | 0.262 | 0.324 |
| 503 | Rwamagana | own.pigs | 0.277 | 0.173 | 0.324 |
| 487 | Rwamagana | chickens | 1.580 | 0.964 | 0.329 |
| 370 | Nyanza | chickens | 2.298 | 1.174 | 0.331 |
| 172 | Karongi | hilltop | 0.068 | 0.025 | 0.341 |
| 303 | Nyamagabe | Total.C | 2.282 | 2.160 | 0.341 |
| 347 | Nyamasheke | own.pigs | 0.409 | 0.308 | 0.341 |
| 407 | Nyaruguru | cows | 1.196 | 0.891 | 0.343 |
| 92 | Gisagara | valley | 0.283 | 0.130 | 0.346 |
| 203 | Kayonza | n_season_lime | 0.537 | 0.707 | 0.35 |
| 455 | Rutsiro | m3.Ca | 568.202 | 643.814 | 0.35 |
| 294 | Nyamagabe | sheep | 0.286 | 0.091 | 0.352 |
| 46 | Gatsibo_NLWH | n_season_compost | 2.547 | 3.420 | 0.353 |
| 226 | Kayonza | ExAl | 0.125 | 0.162 | 0.364 |
| 233 | Kayonza | red_soil | 0.204 | 0.086 | 0.364 |
| 348 | Nyamasheke | own.sheep | 0.060 | 0.021 | 0.367 |
| 142 | Huye | maize | 0.062 | 0.017 | 0.374 |
| 197 | Kayonza | age | 42.093 | 46.500 | 0.379 |
| 55 | Gatsibo_NLWH | hilltop | 0.012 | 0.062 | 0.384 |
| 216 | Kayonza | sheep | 0.037 | 0.172 | 0.384 |
| 221 | Kayonza | m3.Ca | 1685.356 | 1998.563 | 0.384 |
| 422 | Nyaruguru | own.cows | 0.826 | 0.674 | 0.397 |
| 79 | Gisagara | female | 0.435 | 0.609 | 0.403 |
| 425 | Nyaruguru | own.pigs | 0.609 | 0.435 | 0.403 |
| 173 | Karongi | cows | 1.383 | 1.127 | 0.404 |
| 312 | Nyamagabe | sandy_soil | 0.089 | 0.200 | 0.407 |
| 410 | Nyaruguru | pigs | 0.783 | 0.457 | 0.407 |
| 262 | Mugonero | pH | 5.075 | 5.177 | 0.43 |
| 375 | Nyanza | bush_bean | 0.255 | 0.413 | 0.436 |
| 266 | Mugonero | own.cows | 0.786 | 0.700 | 0.441 |
| 105 | Gisagara | m3.Mg | 218.396 | 247.896 | 0.446 |
| 223 | Kayonza | pH | 6.111 | 6.242 | 0.446 |
| 260 | Mugonero | m3.Ca | 458.124 | 521.824 | 0.446 |
| 390 | Nyanza | sandy_soil | 0.298 | 0.457 | 0.449 |
| 158 | Karongi | age | 44.741 | 47.285 | 0.453 |
| 180 | Karongi | bush_bean | 0.074 | 0.127 | 0.453 |
| 353 | Nyanza | age | 45.128 | 50.152 | 0.453 |
| 333 | Nyamasheke | sheep | 0.161 | 0.027 | 0.464 |
| 379 | Nyanza | pH | 6.049 | 5.875 | 0.464 |
| 323 | Nyamasheke | n_seasons_leg_2 | 2.101 | 2.660 | 0.469 |
| 424 | Nyaruguru | own.chickens | 0.283 | 0.152 | 0.478 |
| 446 | Rutsiro | cows | 1.442 | 1.048 | 0.478 |
| 460 | Rutsiro | ExAl | 0.807 | 0.706 | 0.493 |
| 33 | Gatsibo_LWH | own.goats | 0.552 | 0.414 | 0.493 |
| 185 | Karongi | Total.N | 0.138 | 0.133 | 0.493 |
| 74 | Gatsibo_NLWH | own.pigs | 0.128 | 0.062 | 0.496 |
| 104 | Gisagara | m3.Ca | 920.553 | 1100.991 | 0.496 |
| 153 | Huye | own.sheep | 0.054 | 0.017 | 0.496 |
| 436 | Rutsiro | n_season_compost | 8.176 | 7.639 | 0.496 |
| 293 | Nyamagabe | pigs | 1.000 | 0.800 | 0.499 |
| 438 | Rutsiro | n_season_fallow | 0.473 | 0.289 | 0.499 |
| 265 | Mugonero | ExAl | 0.928 | 0.814 | 0.516 |
| 103 | Gisagara | maize | 0.043 | 0.000 | 0.516 |
| 250 | Mugonero | hilltop | 0.015 | 0.000 | 0.516 |
| 297 | Nyamagabe | bush_bean | 0.000 | 0.036 | 0.516 |
| 377 | Nyanza | m3.Ca | 1119.490 | 966.731 | 0.516 |
| 398 | Nyaruguru | n_season_lime | 0.043 | 0.000 | 0.516 |
| 507 | Rwamagana | sandy_soil | 0.018 | 0.000 | 0.516 |
| 174 | Karongi | goats | 0.883 | 1.070 | 0.518 |
| 52 | Gatsibo_NLWH | alt | 1012.158 | 1129.172 | 0.521 |
| 102 | Gisagara | bush_bean | 0.217 | 0.348 | 0.521 |
| 118 | Huye | female | 0.500 | 0.591 | 0.521 |
| 199 | Kayonza | own | 0.963 | 0.897 | 0.521 |
| 231 | Kayonza | own.sheep | 0.037 | 0.103 | 0.521 |
| 349 | Nyamasheke | black_soil | 0.537 | 0.616 | 0.521 |
| 430 | Rutsiro | female | 0.545 | 0.620 | 0.521 |
| 109 | Gisagara | ExAl | 0.349 | 0.256 | 0.525 |
| 371 | Nyanza | pigs | 0.085 | 0.217 | 0.525 |
| 386 | Nyanza | own.pigs | 0.064 | 0.152 | 0.525 |
| 419 | Nyaruguru | Total.N | 0.153 | 0.161 | 0.525 |
| 117 | Gisagara | sandy_soil | 0.239 | 0.370 | 0.53 |
| 309 | Nyamagabe | own.sheep | 0.179 | 0.091 | 0.531 |
| 11 | Gatsibo_LWH | n_seasons_leg_2 | 1.983 | 2.737 | 0.531 |
| 32 | Gatsibo_LWH | own.cows | 0.500 | 0.379 | 0.531 |
| 58 | Gatsibo_NLWH | chickens | 1.674 | 1.000 | 0.531 |
| 66 | Gatsibo_NLWH | m3.Mg | 237.170 | 258.718 | 0.531 |
| 93 | Gisagara | hillside | 0.587 | 0.717 | 0.531 |
| 209 | Kayonza | valley | 0.556 | 0.431 | 0.531 |
| 214 | Kayonza | chickens | 1.000 | 0.466 | 0.531 |
| 220 | Kayonza | maize | 0.019 | 0.069 | 0.531 |
| 283 | Nyamagabe | n_seasons_leg_1 | 1.625 | 1.145 | 0.531 |
| 383 | Nyanza | own.cows | 0.660 | 0.522 | 0.531 |
| 394 | Nyaruguru | own | 0.848 | 0.935 | 0.531 |
| 465 | Rutsiro | own.sheep | 0.212 | 0.157 | 0.531 |
| 479 | Rwamagana | n_seasons_leg_2 | 1.324 | 1.700 | 0.531 |
| 488 | Rwamagana | pigs | 0.509 | 0.318 | 0.531 |
| 17 | Gatsibo_LWH | cows | 0.759 | 0.552 | 0.554 |
| 163 | Karongi | n_season_compost | 7.068 | 6.557 | 0.554 |
| 159 | Karongi | hhsize | 5.185 | 4.905 | 0.559 |
| 314 | Nyamasheke | age | 45.738 | 47.938 | 0.559 |
| 111 | Gisagara | own.goats | 0.630 | 0.500 | 0.56 |
| 136 | Huye | chickens | 1.795 | 1.348 | 0.56 |
| 160 | Karongi | own | 0.957 | 0.981 | 0.56 |
| 211 | Kayonza | hilltop | 0.093 | 0.172 | 0.563 |
| 339 | Nyamasheke | m3.Mg | 167.529 | 152.133 | 0.567 |
| 311 | Nyamagabe | red_soil | 0.464 | 0.582 | 0.567 |
| 127 | Huye | n_seasons_leg_1 | 1.393 | 1.043 | 0.571 |
| 143 | Huye | m3.Ca | 812.314 | 882.646 | 0.574 |
| 447 | Rutsiro | goats | 0.515 | 0.663 | 0.582 |
| 480 | Rwamagana | slope | 3.705 | 4.273 | 0.582 |
| 107 | Gisagara | Total.N | 0.142 | 0.149 | 0.586 |
| 76 | Gatsibo_NLWH | black_soil | 0.616 | 0.704 | 0.592 |
| 106 | Gisagara | pH | 5.706 | 5.842 | 0.6 |
| 171 | Karongi | hillside | 0.784 | 0.835 | 0.602 |
| 18 | Gatsibo_LWH | goats | 1.345 | 1.000 | 0.608 |
| 19 | Gatsibo_LWH | chickens | 1.052 | 0.690 | 0.608 |
| 146 | Huye | Total.N | 0.134 | 0.137 | 0.608 |
| 219 | Kayonza | bush_bean | 0.222 | 0.138 | 0.608 |
| 298 | Nyamagabe | maize | 0.089 | 0.036 | 0.608 |
| 389 | Nyanza | red_soil | 0.106 | 0.043 | 0.608 |
| 442 | Rutsiro | alt | 1945.122 | 1896.140 | 0.608 |
| 462 | Rutsiro | own.goats | 0.273 | 0.331 | 0.608 |
| 71 | Gatsibo_NLWH | own.cows | 0.570 | 0.481 | 0.61 |
| 374 | Nyanza | sorghum | 0.298 | 0.196 | 0.61 |
| 268 | Mugonero | own.chickens | 0.382 | 0.315 | 0.62 |
| 154 | Huye | black_soil | 0.509 | 0.583 | 0.627 |
| 64 | Gatsibo_NLWH | maize | 0.093 | 0.049 | 0.637 |
| 213 | Kayonza | goats | 1.759 | 1.397 | 0.637 |
| 344 | Nyamasheke | own.cows | 0.638 | 0.575 | 0.638 |
| 47 | Gatsibo_NLWH | n_season_lime | 0.058 | 0.025 | 0.641 |
| 228 | Kayonza | own.goats | 0.685 | 0.586 | 0.641 |
| 234 | Kayonza | sandy_soil | 0.037 | 0.086 | 0.641 |
| 57 | Gatsibo_NLWH | goats | 1.198 | 1.654 | 0.66 |
| 3 | Gatsibo_LWH | hhsize | 5.362 | 5.741 | 0.66 |
| 4 | Gatsibo_LWH | own | 0.983 | 1.000 | 0.66 |
| 56 | Gatsibo_NLWH | cows | 0.895 | 0.741 | 0.66 |
| 78 | Gatsibo_NLWH | sandy_soil | 0.279 | 0.210 | 0.66 |
| 88 | Gisagara | n_seasons_leg_1 | 2.304 | 2.870 | 0.66 |
| 115 | Gisagara | black_soil | 0.587 | 0.478 | 0.66 |
| 121 | Huye | own | 0.973 | 0.991 | 0.66 |
| 125 | Huye | n_season_lime | 0.036 | 0.113 | 0.66 |
| 184 | Karongi | pH | 5.423 | 5.481 | 0.66 |
| 186 | Karongi | Total.C | 1.898 | 1.851 | 0.66 |
| 200 | Kayonza | field.size | 572.833 | 423.776 | 0.66 |
| 217 | Kayonza | climbing_bean | 0.000 | 0.017 | 0.66 |
| 235 | Mugonero | female | 0.656 | 0.715 | 0.66 |
| 238 | Mugonero | own | 0.947 | 0.915 | 0.66 |
| 259 | Mugonero | maize | 0.000 | 0.008 | 0.66 |
| 350 | Nyamasheke | red_soil | 0.315 | 0.260 | 0.66 |
| 368 | Nyanza | cows | 1.170 | 0.891 | 0.66 |
| 376 | Nyanza | maize | 0.128 | 0.065 | 0.66 |
| 395 | Nyaruguru | field.size | 570.022 | 495.239 | 0.66 |
| 414 | Nyaruguru | bush_bean | 0.065 | 0.022 | 0.66 |
| 415 | Nyaruguru | maize | 0.022 | 0.000 | 0.66 |
| 434 | Rutsiro | field.size | 584.327 | 429.614 | 0.66 |
| 443 | Rutsiro | valley | 0.152 | 0.193 | 0.66 |
| 449 | Rutsiro | pigs | 0.315 | 0.241 | 0.66 |
| 489 | Rwamagana | sheep | 0.500 | 0.364 | 0.66 |
| 493 | Rwamagana | maize | 0.027 | 0.009 | 0.66 |
| 251 | Mugonero | cows | 1.328 | 1.169 | 0.664 |
| 456 | Rutsiro | m3.Mg | 163.400 | 173.586 | 0.664 |
| 178 | Karongi | climbing_bean | 0.407 | 0.354 | 0.664 |
| 218 | Kayonza | sorghum | 0.167 | 0.103 | 0.669 |
| 8 | Gatsibo_LWH | n_season_lime | 0.500 | 0.328 | 0.678 |
| 37 | Gatsibo_LWH | black_soil | 0.172 | 0.241 | 0.678 |
| 38 | Gatsibo_LWH | red_soil | 0.638 | 0.552 | 0.678 |
| 53 | Gatsibo_NLWH | valley | 0.756 | 0.691 | 0.678 |
| 83 | Gisagara | field.size | 493.957 | 559.261 | 0.678 |
| 135 | Huye | goats | 0.955 | 0.809 | 0.678 |
| 145 | Huye | pH | 5.557 | 5.616 | 0.678 |
| 167 | Karongi | n_seasons_leg_2 | 3.174 | 3.558 | 0.678 |
| 182 | Karongi | m3.Ca | 735.109 | 793.491 | 0.678 |
| 183 | Karongi | m3.Mg | 238.490 | 256.902 | 0.678 |
| 239 | Mugonero | field.size | 441.359 | 391.208 | 0.678 |
| 290 | Nyamagabe | cows | 0.911 | 0.727 | 0.678 |
| 325 | Nyamasheke | alt | 1676.658 | 1702.940 | 0.678 |
| 351 | Nyamasheke | sandy_soil | 0.114 | 0.082 | 0.678 |
| 385 | Nyanza | own.chickens | 0.532 | 0.435 | 0.678 |
| 409 | Nyaruguru | chickens | 0.652 | 0.391 | 0.678 |
| 421 | Nyaruguru | ExAl | 0.851 | 0.956 | 0.678 |
| 454 | Rutsiro | maize | 0.176 | 0.217 | 0.678 |
| 5 | Gatsibo_LWH | field.size | 776.345 | 919.724 | 0.688 |
| 492 | Rwamagana | bush_bean | 0.366 | 0.309 | 0.688 |
| 48 | Gatsibo_NLWH | n_season_fallow | 0.744 | 0.543 | 0.694 |
| 91 | Gisagara | alt | 1036.055 | 923.525 | 0.694 |
| 187 | Karongi | ExAl | 0.631 | 0.573 | 0.694 |
| 308 | Nyamagabe | own.pigs | 0.732 | 0.655 | 0.694 |
| 313 | Nyamasheke | female | 0.644 | 0.692 | 0.7 |
| 332 | Nyamasheke | pigs | 0.779 | 0.582 | 0.7 |
| 418 | Nyaruguru | pH | 5.049 | 4.957 | 0.7 |
| 150 | Huye | own.goats | 0.518 | 0.461 | 0.706 |
| 141 | Huye | bush_bean | 0.196 | 0.243 | 0.706 |
| 423 | Nyaruguru | own.goats | 0.348 | 0.435 | 0.711 |
| 31 | Gatsibo_LWH | ExAl | 0.174 | 0.203 | 0.714 |
| 405 | Nyaruguru | hillside | 0.957 | 0.913 | 0.714 |
| 406 | Nyaruguru | hilltop | 0.043 | 0.087 | 0.714 |
| 101 | Gisagara | sorghum | 0.435 | 0.522 | 0.72 |
| 254 | Mugonero | pigs | 0.412 | 0.331 | 0.722 |
| 340 | Nyamasheke | pH | 5.136 | 5.089 | 0.727 |
| 155 | Huye | red_soil | 0.179 | 0.139 | 0.729 |
| 34 | Gatsibo_LWH | own.chickens | 0.345 | 0.276 | 0.738 |
| 73 | Gatsibo_NLWH | own.chickens | 0.302 | 0.247 | 0.738 |
| 189 | Karongi | own.goats | 0.494 | 0.538 | 0.743 |
| 270 | Mugonero | own.sheep | 0.122 | 0.092 | 0.752 |
| 403 | Nyaruguru | alt | 1791.131 | 1814.738 | 0.752 |
| 63 | Gatsibo_NLWH | bush_bean | 0.174 | 0.222 | 0.754 |
| 364 | Nyanza | alt | 1508.930 | 1529.554 | 0.754 |
| 504 | Rwamagana | own.sheep | 0.223 | 0.182 | 0.754 |
| 10 | Gatsibo_LWH | n_seasons_leg_1 | 4.345 | 4.690 | 0.756 |
| 80 | Gisagara | age | 44.761 | 46.848 | 0.756 |
| 126 | Huye | n_season_fallow | 0.661 | 0.487 | 0.756 |
| 133 | Huye | hilltop | 0.045 | 0.026 | 0.756 |
| 138 | Huye | sheep | 0.062 | 0.035 | 0.756 |
| 194 | Karongi | red_soil | 0.327 | 0.367 | 0.756 |
| 215 | Kayonza | pigs | 0.315 | 0.224 | 0.756 |
| 269 | Mugonero | own.pigs | 0.244 | 0.285 | 0.756 |
| 272 | Mugonero | red_soil | 0.252 | 0.292 | 0.756 |
| 278 | Nyamagabe | field.size | 310.929 | 283.782 | 0.756 |
| 291 | Nyamagabe | goats | 0.750 | 0.618 | 0.756 |
| 429 | Nyaruguru | sandy_soil | 0.109 | 0.065 | 0.756 |
| 475 | Rwamagana | n_season_compost | 4.009 | 3.691 | 0.756 |
| 72 | Gatsibo_NLWH | own.goats | 0.512 | 0.568 | 0.757 |
| 124 | Huye | n_season_compost | 5.232 | 4.878 | 0.757 |
| 176 | Karongi | pigs | 0.531 | 0.449 | 0.757 |
| 161 | Karongi | field.size | 492.262 | 448.278 | 0.76 |
| 384 | Nyanza | own.goats | 0.617 | 0.543 | 0.76 |
| 388 | Nyanza | black_soil | 0.574 | 0.500 | 0.76 |
| 354 | Nyanza | hhsize | 5.213 | 4.891 | 0.763 |
| 360 | Nyanza | n_season_fallow | 1.149 | 0.870 | 0.769 |
| 499 | Rwamagana | ExAl | 0.348 | 0.320 | 0.779 |
| 20 | Gatsibo_LWH | pigs | 0.138 | 0.207 | 0.787 |
| 60 | Gatsibo_NLWH | sheep | 0.047 | 0.025 | 0.787 |
| 227 | Kayonza | own.cows | 0.370 | 0.310 | 0.787 |
| 229 | Kayonza | own.chickens | 0.185 | 0.138 | 0.787 |
| 230 | Kayonza | own.pigs | 0.222 | 0.172 | 0.787 |
| 317 | Nyamasheke | field.size | 791.856 | 620.884 | 0.787 |
| 427 | Nyaruguru | black_soil | 0.630 | 0.696 | 0.787 |
| 467 | Rutsiro | red_soil | 0.315 | 0.349 | 0.787 |
| 473 | Rwamagana | field.size | 918.893 | 837.927 | 0.787 |
| 26 | Gatsibo_LWH | m3.Ca | 1281.010 | 1186.023 | 0.788 |
| 42 | Gatsibo_NLWH | hhsize | 5.349 | 5.160 | 0.788 |
| 147 | Huye | Total.C | 1.869 | 1.895 | 0.788 |
| 244 | Mugonero | n_seasons_leg_1 | 1.336 | 1.177 | 0.788 |
| 282 | Nyamagabe | n_season_fallow | 0.268 | 0.182 | 0.788 |
| 378 | Nyanza | m3.Mg | 242.303 | 230.433 | 0.788 |
| 433 | Rutsiro | own | 0.915 | 0.934 | 0.788 |
| 85 | Gisagara | n_season_compost | 4.326 | 3.957 | 0.791 |
| 70 | Gatsibo_NLWH | ExAl | 0.202 | 0.220 | 0.793 |
| 44 | Gatsibo_NLWH | field.size | 1319.302 | 1459.901 | 0.796 |
| 468 | Rutsiro | sandy_soil | 0.176 | 0.151 | 0.796 |
| 408 | Nyaruguru | goats | 0.609 | 0.739 | 0.805 |
| 486 | Rwamagana | goats | 1.768 | 1.582 | 0.805 |
| 23 | Gatsibo_LWH | sorghum | 0.276 | 0.328 | 0.805 |
| 372 | Nyanza | sheep | 0.021 | 0.043 | 0.805 |
| 387 | Nyanza | own.sheep | 0.021 | 0.043 | 0.805 |
| 496 | Rwamagana | pH | 5.697 | 5.735 | 0.805 |
| 27 | Gatsibo_LWH | m3.Mg | 253.408 | 241.489 | 0.812 |
| 36 | Gatsibo_LWH | own.sheep | 0.034 | 0.017 | 0.812 |
| 399 | Nyaruguru | n_season_fallow | 0.370 | 0.261 | 0.812 |
| 24 | Gatsibo_LWH | bush_bean | 0.397 | 0.345 | 0.815 |
| 82 | Gisagara | own | 0.826 | 0.870 | 0.815 |
| 277 | Nyamagabe | own | 0.964 | 0.982 | 0.819 |
| 373 | Nyanza | climbing_bean | 0.043 | 0.022 | 0.819 |
| 271 | Mugonero | black_soil | 0.527 | 0.492 | 0.821 |
| 431 | Rutsiro | age | 43.685 | 44.614 | 0.821 |
| 498 | Rwamagana | Total.C | 2.161 | 2.188 | 0.821 |
| 501 | Rwamagana | own.goats | 0.536 | 0.573 | 0.821 |
| 90 | Gisagara | slope | 8.391 | 7.739 | 0.823 |
| 9 | Gatsibo_LWH | n_season_fallow | 0.397 | 0.293 | 0.834 |
| 190 | Karongi | own.chickens | 0.407 | 0.437 | 0.835 |
| 338 | Nyamasheke | m3.Ca | 504.693 | 482.970 | 0.836 |
| 28 | Gatsibo_LWH | pH | 6.055 | 6.005 | 0.837 |
| 188 | Karongi | own.cows | 0.765 | 0.741 | 0.839 |
| 206 | Kayonza | n_seasons_leg_2 | 1.111 | 1.259 | 0.839 |
| 122 | Huye | field.size | 599.607 | 567.922 | 0.845 |
| 205 | Kayonza | n_seasons_leg_1 | 2.019 | 2.155 | 0.845 |
| 281 | Nyamagabe | n_season_lime | 0.500 | 0.400 | 0.845 |
| 412 | Nyaruguru | climbing_bean | 0.196 | 0.239 | 0.845 |
| 39 | Gatsibo_LWH | sandy_soil | 0.172 | 0.207 | 0.851 |
| 77 | Gatsibo_NLWH | red_soil | 0.035 | 0.049 | 0.851 |
| 192 | Karongi | own.sheep | 0.080 | 0.095 | 0.851 |
| 210 | Kayonza | hillside | 0.352 | 0.397 | 0.851 |
| 253 | Mugonero | chickens | 1.076 | 0.954 | 0.851 |
| 286 | Nyamagabe | alt | 2124.389 | 2110.970 | 0.851 |
| 288 | Nyamagabe | hillside | 0.821 | 0.855 | 0.851 |
| 296 | Nyamagabe | sorghum | 0.036 | 0.055 | 0.851 |
| 301 | Nyamagabe | pH | 4.838 | 4.869 | 0.851 |
| 306 | Nyamagabe | own.goats | 0.464 | 0.418 | 0.851 |
| 397 | Nyaruguru | n_season_compost | 5.696 | 6.043 | 0.851 |
| 481 | Rwamagana | alt | 990.930 | 956.133 | 0.851 |
| 505 | Rwamagana | black_soil | 0.652 | 0.682 | 0.851 |
| 246 | Mugonero | slope | 15.939 | 16.462 | 0.853 |
| 267 | Mugonero | own.goats | 0.542 | 0.569 | 0.862 |
| 400 | Nyaruguru | n_seasons_leg_1 | 2.043 | 2.348 | 0.862 |
| 482 | Rwamagana | valley | 0.402 | 0.373 | 0.862 |
| 151 | Huye | own.chickens | 0.420 | 0.391 | 0.865 |
| 289 | Nyamagabe | hilltop | 0.054 | 0.036 | 0.865 |
| 343 | Nyamasheke | ExAl | 1.006 | 1.037 | 0.868 |
| 13 | Gatsibo_LWH | alt | 1094.532 | 1135.001 | 0.872 |
| 316 | Nyamasheke | own | 0.926 | 0.938 | 0.873 |
| 483 | Rwamagana | hillside | 0.536 | 0.564 | 0.873 |
| 476 | Rwamagana | n_season_lime | 0.348 | 0.318 | 0.887 |
| 75 | Gatsibo_NLWH | own.sheep | 0.035 | 0.025 | 0.894 |
| 284 | Nyamagabe | n_seasons_leg_2 | 3.161 | 2.982 | 0.894 |
| 356 | Nyanza | field.size | 838.926 | 766.217 | 0.894 |
| 361 | Nyanza | n_seasons_leg_1 | 2.511 | 2.283 | 0.894 |
| 466 | Rutsiro | black_soil | 0.461 | 0.440 | 0.894 |
| 144 | Huye | m3.Mg | 215.213 | 211.384 | 0.894 |
| 336 | Nyamasheke | bush_bean | 0.087 | 0.075 | 0.894 |
| 129 | Huye | slope | 6.616 | 6.817 | 0.903 |
| 490 | Rwamagana | climbing_bean | 0.036 | 0.027 | 0.903 |
| 177 | Karongi | sheep | 0.167 | 0.196 | 0.905 |
| 195 | Karongi | sandy_soil | 0.025 | 0.019 | 0.908 |
| 264 | Mugonero | Total.C | 2.147 | 2.171 | 0.912 |
| 280 | Nyamagabe | n_season_compost | 7.857 | 7.655 | 0.912 |
| 300 | Nyamagabe | m3.Mg | 115.173 | 112.262 | 0.913 |
| 212 | Kayonza | cows | 0.685 | 0.828 | 0.913 |
| 249 | Mugonero | hillside | 0.855 | 0.869 | 0.913 |
| 67 | Gatsibo_NLWH | pH | 6.011 | 5.989 | 0.917 |
| 222 | Kayonza | m3.Mg | 347.261 | 354.791 | 0.917 |
| 304 | Nyamagabe | ExAl | 1.150 | 1.119 | 0.918 |
| 464 | Rutsiro | own.pigs | 0.194 | 0.181 | 0.926 |
| 94 | Gisagara | hilltop | 0.130 | 0.152 | 0.93 |
| 116 | Gisagara | red_soil | 0.152 | 0.130 | 0.93 |
| 416 | Nyaruguru | m3.Ca | 415.882 | 434.099 | 0.93 |
| 35 | Gatsibo_LWH | own.pigs | 0.121 | 0.103 | 0.932 |
| 444 | Rutsiro | hillside | 0.752 | 0.765 | 0.934 |
| 7 | Gatsibo_LWH | n_season_compost | 5.672 | 5.879 | 0.936 |
| 401 | Nyaruguru | n_seasons_leg_2 | 3.178 | 3.370 | 0.937 |
| 471 | Rwamagana | hhsize | 4.920 | 5.000 | 0.938 |
| 241 | Mugonero | n_season_compost | 6.229 | 6.100 | 0.938 |
| 345 | Nyamasheke | own.goats | 0.430 | 0.445 | 0.938 |
| 287 | Nyamagabe | valley | 0.125 | 0.109 | 0.946 |
| 478 | Rwamagana | n_seasons_leg_1 | 1.652 | 1.582 | 0.946 |
| 175 | Karongi | chickens | 1.463 | 1.386 | 0.949 |
| 232 | Kayonza | black_soil | 0.685 | 0.707 | 0.949 |
| 257 | Mugonero | sorghum | 0.061 | 0.054 | 0.949 |
| 428 | Nyaruguru | red_soil | 0.261 | 0.239 | 0.954 |
| 29 | Gatsibo_LWH | Total.N | 0.149 | 0.148 | 0.956 |
| 54 | Gatsibo_NLWH | hillside | 0.233 | 0.247 | 0.956 |
| 130 | Huye | alt | 1649.310 | 1655.403 | 0.956 |
| 148 | Huye | ExAl | 0.290 | 0.298 | 0.956 |
| 170 | Karongi | valley | 0.148 | 0.139 | 0.956 |
| 295 | Nyamagabe | climbing_bean | 0.161 | 0.145 | 0.956 |
| 342 | Nyamasheke | Total.C | 2.242 | 2.256 | 0.956 |
| 381 | Nyanza | Total.C | 1.763 | 1.747 | 0.956 |
| 198 | Kayonza | hhsize | 5.074 | 5.155 | 0.956 |
| 139 | Huye | climbing_bean | 0.089 | 0.096 | 0.96 |
| 156 | Huye | sandy_soil | 0.268 | 0.278 | 0.96 |
| 193 | Karongi | black_soil | 0.580 | 0.570 | 0.96 |
| 243 | Mugonero | n_season_fallow | 0.863 | 0.908 | 0.96 |
| 247 | Mugonero | alt | 1796.175 | 1787.253 | 0.96 |
| 261 | Mugonero | m3.Mg | 162.889 | 165.293 | 0.96 |
| 299 | Nyamagabe | m3.Ca | 329.187 | 335.265 | 0.96 |
| 330 | Nyamasheke | goats | 0.899 | 0.870 | 0.96 |
| 331 | Nyamasheke | chickens | 0.591 | 0.562 | 0.96 |
| 341 | Nyamasheke | Total.N | 0.162 | 0.161 | 0.96 |
| 358 | Nyanza | n_season_compost | 3.191 | 3.326 | 0.96 |
| 369 | Nyanza | goats | 1.234 | 1.283 | 0.96 |
| 417 | Nyaruguru | m3.Mg | 146.273 | 144.478 | 0.96 |
| 440 | Rutsiro | n_seasons_leg_2 | 2.036 | 2.091 | 0.96 |
| 448 | Rutsiro | chickens | 0.879 | 0.964 | 0.96 |
| 458 | Rutsiro | Total.N | 0.149 | 0.150 | 0.96 |
| 494 | Rwamagana | m3.Ca | 1230.875 | 1244.512 | 0.96 |
| 506 | Rwamagana | red_soil | 0.330 | 0.318 | 0.96 |
| 21 | Gatsibo_LWH | sheep | 0.103 | 0.086 | 0.971 |
| 252 | Mugonero | goats | 1.321 | 1.292 | 0.973 |
| 439 | Rutsiro | n_seasons_leg_1 | 3.388 | 3.440 | 0.974 |
| 30 | Gatsibo_LWH | Total.C | 1.915 | 1.906 | 0.999 |
| 65 | Gatsibo_NLWH | m3.Ca | 1198.728 | 1210.672 | 0.999 |
| 166 | Karongi | n_seasons_leg_1 | 2.438 | 2.468 | 0.999 |
| 179 | Karongi | sorghum | 0.148 | 0.152 | 0.999 |
| 202 | Kayonza | n_season_compost | 3.481 | 3.534 | 0.999 |
| 204 | Kayonza | n_season_fallow | 0.463 | 0.483 | 0.999 |
| 255 | Mugonero | sheep | 0.221 | 0.231 | 0.999 |
| 321 | Nyamasheke | n_season_fallow | 0.651 | 0.671 | 0.999 |
| 326 | Nyamasheke | valley | 0.154 | 0.151 | 0.999 |
| 346 | Nyamasheke | own.chickens | 0.215 | 0.219 | 0.999 |
| 495 | Rwamagana | m3.Mg | 268.544 | 267.492 | 0.999 |
| 12 | Gatsibo_LWH | slope | 4.259 | 4.224 | 1 |
| 14 | Gatsibo_LWH | valley | 0.603 | 0.603 | 1 |
| 15 | Gatsibo_LWH | hillside | 0.397 | 0.397 | 1 |
| 49 | Gatsibo_NLWH | n_seasons_leg_1 | 1.547 | 1.556 | 1 |
| 51 | Gatsibo_NLWH | slope | 4.058 | 4.099 | 1 |
| 68 | Gatsibo_NLWH | Total.N | 0.159 | 0.159 | 1 |
| 69 | Gatsibo_NLWH | Total.C | 2.009 | 2.012 | 1 |
| 86 | Gisagara | n_season_lime | 0.022 | 0.022 | 1 |
| 99 | Gisagara | sheep | 0.022 | 0.022 | 1 |
| 100 | Gisagara | climbing_bean | 0.022 | 0.022 | 1 |
| 114 | Gisagara | own.sheep | 0.022 | 0.022 | 1 |
| 165 | Karongi | n_season_fallow | 0.840 | 0.848 | 1 |
| 168 | Karongi | slope | 12.006 | 11.975 | 1 |
| 169 | Karongi | alt | 1807.818 | 1806.996 | 1 |
| 208 | Kayonza | alt | 1203.816 | 1202.646 | 1 |
| 248 | Mugonero | valley | 0.130 | 0.131 | 1 |
| 258 | Mugonero | bush_bean | 0.176 | 0.177 | 1 |
| 263 | Mugonero | Total.N | 0.145 | 0.145 | 1 |
| 273 | Mugonero | sandy_soil | 0.176 | 0.177 | 1 |
| 285 | Nyamagabe | slope | 12.571 | 12.618 | 1 |
| 302 | Nyamagabe | Total.N | 0.153 | 0.153 | 1 |
| 327 | Nyamasheke | hillside | 0.819 | 0.822 | 1 |
| 328 | Nyamasheke | hilltop | 0.027 | 0.027 | 1 |
| 329 | Nyamasheke | cows | 1.242 | 1.267 | 1 |
| 380 | Nyanza | Total.N | 0.126 | 0.126 | 1 |
| 392 | Nyaruguru | age | 48.457 | 48.304 | 1 |
| 402 | Nyaruguru | slope | 9.478 | 9.457 | 1 |
| 413 | Nyaruguru | sorghum | 0.304 | 0.304 | 1 |
| 451 | Rutsiro | climbing_bean | 0.327 | 0.331 | 1 |
| 453 | Rutsiro | bush_bean | 0.012 | 0.012 | 1 |
| 459 | Rutsiro | Total.C | 2.187 | 2.187 | 1 |
| 484 | Rwamagana | hilltop | 0.062 | 0.064 | 1 |
| 497 | Rwamagana | Total.N | 0.182 | 0.182 | 1 |
| 16 | Gatsibo_LWH | hilltop | 0.000 | 0.000 | NA |
| 22 | Gatsibo_LWH | climbing_bean | 0.000 | 0.000 | NA |
| 25 | Gatsibo_LWH | maize | 0.000 | 0.000 | NA |
| 61 | Gatsibo_NLWH | climbing_bean | 0.000 | 0.000 | NA |
| 335 | Nyamasheke | sorghum | 0.000 | 0.000 | NA |
| 359 | Nyanza | n_season_lime | 0.000 | 0.000 | NA |
| 367 | Nyanza | hilltop | 0.000 | 0.000 | NA |
| 404 | Nyaruguru | valley | 0.000 | 0.000 | NA |
| 452 | Rutsiro | sorghum | 0.000 | 0.000 | NA |
Demographic variables interpretation here.
Agricultural practice variables interpretation here
Soil Variables interpretation here
write.csv(dist.output, file=paste("output", "district balance.csv", sep="/"), row.names=T)
Look at farmers by duration of tenure farming with Tubura. We want to understand, at least with an initial naive baseline sense, what is the cumulative effect of Tubura practices on soil health outcomes?
We will look only at current Tubura farmers and compare first year farmers to farmers with more experience with Tubura.
oafOnly <- d[which(d$client==1 & d$total.seasons>=1),]
nTenure <- oafOnly %>% group_by(total.seasons) %>%
dplyr::summarize(
n = n()
) %>% ungroup() %>% as.data.frame()
nTenure$val <- paste(nTenure$total.seasons, " (", "n = ", nTenure$n, ")", sep = "")
for(i in 1:length(soilVars)){
print(
ggplot(oafOnly, aes(x=as.factor(total.seasons), y=oafOnly[,soilVars[i]])) +
geom_boxplot() +
scale_x_discrete(labels=nTenure$val) +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x="Tubura Tenure", y=soilVars[i], title = paste("RW baseline soil by tenure - ", soilVars[i], sep = ""))
)
}
tenureSum <- aggregate(oafOnly[,out.list], by=list(oafOnly$total.seasons), function(x){
round(mean(x, na.rm=T),2)
})
tenureSum <- as.data.frame(t(tenureSum))
colnames(tenureSum) <- c(paste(seq(1,14,1), " seas.", sep = ""))
print(kable(tenureSum))
| 1 seas. | 2 seas. | 3 seas. | 4 seas. | 5 seas. | 6 seas. | 7 seas. | 8 seas. | 9 seas. | 10 seas. | 11 seas. | 12 seas. | 13 seas. | 14 seas. | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Group.1 | 1.00 | 2.00 | 3.00 | 4.00 | 5.00 | 6.00 | 7.00 | 8.00 | 9.00 | 10.00 | 11.00 | 12.00 | 13.00 | 14.00 |
| female | 0.43 | 0.48 | 0.60 | 0.49 | 0.58 | 0.59 | 0.54 | 0.50 | 0.65 | 0.60 | 0.75 | 0.54 | 0.55 | 0.48 |
| age | 42.52 | 42.77 | 44.33 | 45.68 | 43.07 | 43.96 | 43.71 | 50.91 | 51.85 | 42.67 | 48.25 | 49.00 | 52.73 | 49.52 |
| hhsize | 5.23 | 5.19 | 5.08 | 5.22 | 6.07 | 5.58 | 6.11 | 5.66 | 6.65 | 6.13 | 6.58 | 5.51 | 4.55 | 6.12 |
| own | 0.93 | 0.95 | 0.92 | 0.95 | 0.98 | 0.97 | 1.00 | 0.93 | 0.85 | 0.97 | 0.92 | 0.90 | 0.91 | 0.95 |
| field.size | 570.34 | 634.79 | 630.65 | 554.01 | 486.51 | 484.79 | 412.07 | 592.95 | 330.00 | 644.30 | 617.83 | 444.85 | 280.82 | 679.71 |
| n_season_fert | 1.78 | 2.33 | 2.63 | 3.29 | 4.67 | 4.90 | 5.93 | 5.59 | 6.00 | 5.63 | 7.25 | 7.08 | 7.18 | 6.79 |
| n_season_compost | 5.36 | 5.33 | 4.81 | 6.36 | 6.84 | 7.13 | 8.36 | 7.43 | 7.30 | 7.33 | 8.67 | 8.15 | 8.27 | 7.81 |
| n_season_lime | 0.17 | 0.27 | 0.36 | 0.31 | 0.22 | 0.26 | 0.07 | 0.07 | 0.20 | 0.27 | 0.08 | 0.15 | 0.09 | 0.48 |
| n_season_fallow | 0.64 | 0.50 | 0.77 | 0.91 | 0.67 | 0.66 | 0.64 | 0.77 | 1.10 | 0.47 | 0.42 | 0.23 | 1.82 | 0.67 |
| n_seasons_leg_1 | 1.99 | 2.04 | 2.10 | 2.44 | 2.93 | 2.89 | 1.93 | 2.91 | 1.60 | 1.77 | 2.92 | 3.54 | 2.09 | 3.02 |
| n_seasons_leg_2 | 2.47 | 2.48 | 2.03 | 2.34 | 2.71 | 2.62 | 4.75 | 3.50 | 2.75 | 2.50 | 3.92 | 1.45 | 2.82 | 2.12 |
| slope | 9.07 | 8.71 | 9.65 | 8.80 | 9.58 | 11.84 | 14.29 | 11.77 | 13.60 | 12.07 | 14.17 | 12.00 | 14.91 | 14.38 |
| alt | 1536.57 | 1491.09 | 1550.41 | 1516.91 | 1650.84 | 1582.27 | 1758.60 | 1639.74 | 1812.98 | 1755.70 | 1788.95 | 1644.45 | 1702.35 | 1633.76 |
| valley | 0.32 | 0.32 | 0.27 | 0.26 | 0.31 | 0.22 | 0.25 | 0.23 | 0.00 | 0.10 | 0.33 | 0.18 | 0.09 | 0.21 |
| hillside | 0.62 | 0.62 | 0.71 | 0.71 | 0.67 | 0.74 | 0.71 | 0.66 | 0.95 | 0.87 | 0.67 | 0.79 | 0.82 | 0.76 |
| hilltop | 0.06 | 0.06 | 0.03 | 0.03 | 0.02 | 0.05 | 0.04 | 0.11 | 0.05 | 0.03 | 0.00 | 0.03 | 0.09 | 0.02 |
| cows | 0.97 | 1.00 | 0.94 | 1.49 | 1.93 | 1.28 | 1.29 | 1.09 | 3.25 | 1.27 | 1.75 | 1.00 | 1.00 | 1.07 |
| goats | 0.99 | 1.11 | 1.00 | 1.27 | 1.40 | 1.00 | 0.75 | 1.16 | 1.20 | 0.87 | 0.83 | 1.08 | 0.45 | 1.26 |
| chickens | 1.08 | 1.19 | 1.08 | 1.21 | 1.31 | 0.87 | 1.71 | 1.61 | 1.20 | 1.40 | 3.58 | 1.72 | 0.45 | 1.00 |
| pigs | 0.54 | 0.52 | 0.58 | 0.52 | 0.38 | 0.47 | 0.46 | 0.55 | 0.45 | 0.60 | 0.25 | 0.82 | 0.64 | 0.83 |
| sheep | 0.20 | 0.21 | 0.29 | 0.35 | 0.18 | 0.16 | 0.86 | 0.23 | 0.15 | 0.40 | 0.00 | 0.05 | 0.18 | 0.17 |
| climbing_bean | 0.14 | 0.18 | 0.21 | 0.17 | 0.20 | 0.32 | 0.50 | 0.36 | 0.35 | 0.43 | 0.33 | 0.38 | 0.55 | 0.38 |
| sorghum | 0.17 | 0.17 | 0.12 | 0.14 | 0.11 | 0.06 | 0.04 | 0.02 | 0.05 | 0.10 | 0.17 | 0.05 | 0.09 | 0.00 |
| bush_bean | 0.14 | 0.17 | 0.17 | 0.17 | 0.16 | 0.19 | 0.04 | 0.18 | 0.15 | 0.03 | 0.08 | 0.10 | 0.18 | 0.14 |
| maize | 0.06 | 0.07 | 0.06 | 0.08 | 0.09 | 0.08 | 0.11 | 0.11 | 0.10 | 0.10 | 0.08 | 0.03 | 0.00 | 0.05 |
| m3.Ca | 825.38 | 808.59 | 892.40 | 829.36 | 938.66 | 801.09 | 731.73 | 692.21 | 447.86 | 551.92 | 604.23 | 813.76 | 556.78 | 544.69 |
| m3.Mg | 211.62 | 199.21 | 226.45 | 211.18 | 230.07 | 217.20 | 207.29 | 222.62 | 143.91 | 183.00 | 209.25 | 281.07 | 158.64 | 177.75 |
| pH | 5.47 | 5.50 | 5.49 | 5.46 | 5.59 | 5.48 | 5.41 | 5.42 | 5.06 | 5.21 | 5.33 | 5.46 | 5.27 | 5.19 |
| Total.N | 0.15 | 0.16 | 0.16 | 0.16 | 0.15 | 0.15 | 0.14 | 0.13 | 0.16 | 0.15 | 0.13 | 0.13 | 0.15 | 0.15 |
| Total.C | 2.07 | 2.10 | 2.09 | 2.09 | 2.17 | 2.05 | 2.04 | 1.88 | 2.33 | 2.11 | 1.92 | 1.92 | 2.06 | 2.10 |
| ExAl | 0.58 | 0.59 | 0.63 | 0.60 | 0.50 | 0.65 | 0.67 | 0.56 | 1.01 | 0.81 | 0.68 | 0.54 | 0.81 | 0.81 |
| own.cows | 0.69 | 0.63 | 0.58 | 0.74 | 0.71 | 0.69 | 0.82 | 0.70 | 0.90 | 0.87 | 0.75 | 0.64 | 0.64 | 0.81 |
| own.goats | 0.46 | 0.52 | 0.47 | 0.46 | 0.53 | 0.50 | 0.46 | 0.55 | 0.50 | 0.47 | 0.50 | 0.51 | 0.27 | 0.52 |
| own.chickens | 0.30 | 0.35 | 0.33 | 0.30 | 0.33 | 0.30 | 0.54 | 0.43 | 0.45 | 0.43 | 0.58 | 0.41 | 0.45 | 0.33 |
| own.pigs | 0.42 | 0.37 | 0.32 | 0.32 | 0.27 | 0.33 | 0.39 | 0.36 | 0.35 | 0.37 | 0.17 | 0.38 | 0.36 | 0.40 |
| own.sheep | 0.13 | 0.09 | 0.14 | 0.17 | 0.11 | 0.08 | 0.29 | 0.14 | 0.10 | 0.13 | 0.00 | 0.03 | 0.09 | 0.07 |
| black_soil | 0.54 | 0.54 | 0.55 | 0.56 | 0.49 | 0.43 | 0.54 | 0.45 | 0.55 | 0.50 | 0.25 | 0.62 | 0.45 | 0.62 |
| red_soil | 0.30 | 0.25 | 0.29 | 0.25 | 0.29 | 0.37 | 0.25 | 0.30 | 0.30 | 0.33 | 0.42 | 0.23 | 0.36 | 0.26 |
| sandy_soil | 0.13 | 0.19 | 0.14 | 0.15 | 0.18 | 0.18 | 0.07 | 0.18 | 0.05 | 0.07 | 0.17 | 0.13 | 0.18 | 0.10 |
We’re defining Tubura tenure as having 3 or more seasons of experience farming with Tubura. We draw the line at 3 seasons as three seasons of fertilizer use is approximately when we’d expect fertilizer to start to have a detrimental effect on soil health.
oafOnly$tenured <- ifelse(oafOnly$total.seasons>=3,1,0)
tenure <- do.call(rbind, lapply(out.list, function(x) {
out <- t.test(oafOnly[,x] ~ oafOnly[,"tenured"], data=oafOnly)
tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
tab[,1:2] <- round(tab[,1:2],3)
names(tab) <- c(names(out[[5]]), "pvalue")
return(tab)
}))
# use p.adjust with bonferroni correction
tenure$pvalue <- p.adjust(tenure$pvalue, method="fdr")
rownames(tenure) <- out.list
tenure <- tenure[order(tenure$pvalue),]
tenure$pvalue <- ifelse(tenure[, 3] < 0.001, "< 0.001", round(tenure[, 3], 3))
colnames(tenure) <- c("Non-Tubura", "Tubura Client", "p-value")
print(kable(tenure))
| Non-Tubura | Tubura Client | p-value | |
|---|---|---|---|
| n_season_fert | 2.109 | 4.762 | < 0.001 |
| n_season_compost | 5.344 | 6.890 | < 0.001 |
| climbing_bean | 0.161 | 0.295 | < 0.001 |
| slope | 8.850 | 11.166 | < 0.001 |
| age | 42.675 | 46.039 | 0.001 |
| hillside | 0.617 | 0.735 | 0.001 |
| sorghum | 0.172 | 0.086 | 0.001 |
| n_seasons_leg_1 | 2.022 | 2.579 | 0.005 |
| valley | 0.320 | 0.226 | 0.008 |
| cows | 0.986 | 1.356 | 0.011 |
| hhsize | 5.205 | 5.599 | 0.012 |
| female | 0.459 | 0.551 | 0.018 |
| alt | 1509.106 | 1610.665 | 0.021 |
| Total.N | 0.155 | 0.149 | 0.059 |
| own.cows | 0.656 | 0.717 | 0.123 |
| n_season_fallow | 0.557 | 0.733 | 0.222 |
| field.size | 609.258 | 537.968 | 0.224 |
| pH | 5.485 | 5.420 | 0.224 |
| own.pigs | 0.388 | 0.336 | 0.224 |
| hilltop | 0.063 | 0.039 | 0.233 |
| ExAl | 0.585 | 0.645 | 0.235 |
| m3.Mg | 204.127 | 213.884 | 0.412 |
| m3.Ca | 815.244 | 767.659 | 0.421 |
| sheep | 0.205 | 0.262 | 0.474 |
| own.chickens | 0.328 | 0.360 | 0.474 |
| sandy_soil | 0.167 | 0.142 | 0.474 |
| own.sheep | 0.104 | 0.123 | 0.511 |
| Total.C | 2.090 | 2.065 | 0.574 |
| red_soil | 0.270 | 0.295 | 0.574 |
| chickens | 1.145 | 1.259 | 0.605 |
| n_season_lime | 0.227 | 0.259 | 0.616 |
| maize | 0.066 | 0.077 | 0.616 |
| black_soil | 0.538 | 0.519 | 0.662 |
| n_seasons_leg_2 | 2.475 | 2.569 | 0.743 |
| goats | 1.060 | 1.103 | 0.751 |
| own | 0.940 | 0.945 | 0.788 |
| bush_bean | 0.158 | 0.151 | 0.788 |
| pigs | 0.533 | 0.548 | 0.844 |
| own.goats | 0.492 | 0.488 | 0.91 |
Demographic variables We are well balanced along demographic variables.
Agricultural practice variables Not surprisingly, Tubura farmers have more cumulative years of fertilizer use than current non-Tubura farmers. While that difference is signficant, it is realistically only a single season of fertilizer use different.
Interestingly, non-Tubura farmers reported using more lime than current Tubura farmers. This
Soil Variables Soil pH, calcium and magnesium levels are lower for tenured Tubura farmers. This is consistent with the hypothesis that increaesd fertilizer use leads to an increaese in soil acidity.
Here’s where we’ll look at the contribution of fertilizer, lime and cultivation practices on soil health outcomes. This analysis will be come richer as we gain longitudinal measures. I caution that we cannot treat these relationships as causal. The direction of causality is not clearly delineated in the data or the study design. However, we can identify meaningful connections between practices and outcomes through this analysis to generate new hypotheses for field testing.
I’m going to start with behaviors by sections and then move to a more comprehensive model including multiple practices. All models will include controls for site to account for local variation and field officer behavior.
Check for multicollinearity before adding number of seasons of agronomic inputs on the same side of the regression.
suppressMessages(library(stargazer))
inputUse <- c("n_season_fert","n_season_compost", "n_season_lime", "n_season_fallow")
cor(d[,inputUse], use="complete.obs")
n_season_fert n_season_compost n_season_lime n_season_fallow
n_season_fert 1.0000000 0.35575631 0.17038250 -0.06419940
n_season_compost 0.3557563 1.00000000 0.03734895 -0.16878470
n_season_lime 0.1703825 0.03734895 1.00000000 -0.01112774
n_season_fallow -0.0641994 -0.16878470 -0.01112774 1.00000000
Interpretation: The strongest correlation between the input use intensity variables is between seasons of fertilizer and compost use, ~0.35. While this is on the higher end it’s not necessarily cause for alarm.
inputUse <- paste(c("n_season_fert","n_season_compost", "n_season_lime", "n_season_fallow"), collapse= " + ")
list1 <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", inputUse, "+ as.factor(cell)", sep="")), data=d)
return(mod)
})
stargazer(list1, type="html",
title = "2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models",
covariate.labels = c("Seasons of Fertilizer", "Seasons of Compost",
"Seasons of Lime", "Seasons of Fallow"),
dep.var.caption = "",
dep.var.labels = "",
column.labels = c(gsub("m3.","", soilVars)),
notes = "Includes FE for cell",
omit=c("cell"), out=paste("output", "rw_baseline_agprac.htm", sep="/"))
| Ca | Mg | pH | Total.N | Total.C | ExAl | |
| (1) | (2) | (3) | (4) | (5) | (6) | |
| Seasons of Fertilizer | -3.601 | -0.138 | -0.002 | -0.0001 | -0.002 | 0.0003 |
| (4.042) | (0.776) | (0.004) | (0.0002) | (0.003) | (0.003) | |
| Seasons of Compost | 0.761 | -0.007 | 0.004 | 0.00000 | 0.002 | -0.005* |
| (3.270) | (0.628) | (0.003) | (0.0002) | (0.003) | (0.003) | |
| Seasons of Lime | 42.291** | 7.921** | 0.011 | 0.002** | 0.017 | 0.009 |
| (17.647) | (3.389) | (0.016) | (0.001) | (0.014) | (0.015) | |
| Seasons of Fallow | -32.572*** | -3.291*** | -0.038*** | 0.001 | 0.007 | 0.025*** |
| (6.433) | (1.235) | (0.006) | (0.0004) | (0.005) | (0.005) | |
| Constant | 439.964 | 88.243 | 5.164*** | 0.210*** | 3.436*** | 1.762*** |
| (472.287) | (90.710) | (0.420) | (0.026) | (0.370) | (0.403) | |
| Observations | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 |
| R2 | 0.507 | 0.497 | 0.574 | 0.503 | 0.439 | 0.561 |
| Adjusted R2 | 0.462 | 0.451 | 0.535 | 0.458 | 0.388 | 0.521 |
| Residual Std. Error (df = 2234) | 472.106 | 90.675 | 0.420 | 0.026 | 0.369 | 0.403 |
| F Statistic (df = 204; 2234) | 11.266*** | 10.817*** | 14.741*** | 11.083*** | 8.561*** | 13.994*** |
| Note: | p<0.1; p<0.05; p<0.01 | |||||
| Includes FE for cell | ||||||
Interpretation The naive model suggests that when we include site level fixed effects, duration of agronomic practices don’t have a big effect on soil health outcomes. However, some of the practice intensity variables are not well distributed. Let’s take a look at a log transformation. I’m adding one to the variables as to not end up with lots of Inf values.
Log transformations in theory are appropriate for variables that are right skewed (vavlues clustered to the left of the distribution) and see diminishing returns to increasing values. The shape of the data suggests a log transformation but it’s debateable whether the relationship is diminishing.
agPrac <- c(names(d[grep('n_season_', names(d))]))
for(i in 1:length(agPrac)){
print(
ggplot(d, aes(x=d[,agPrac[i]])) + geom_density() +
labs(x = paste(agPrac[i], " No transform", sep = ""))
)
}
# since these are all skewed, consider a log transform
for(i in 1:length(agPrac)){
print(
ggplot(d, aes(x=log10(d[,agPrac[i]]+1))) + geom_density() +
labs(x = paste(agPrac[i], " Log transform", sep = ""))
)
}
# look at other transfomations
for(i in 1:length(agPrac)){
print(
ggplot(d, aes(x=d[,agPrac[i]]^(1/3))) + geom_density() +
labs(x = paste(agPrac[i], " cubic root transform", sep = ""))
)
}
# visualize the outcomes as well to see if a transformation is warranted
for(i in 1:length(soilVars)){
print(
ggplot(d, aes(x=d[,soilVars[i]])) + geom_density() +
labs(x = soilVars[i], title = soilVars[i])
)
}
d$logFert <- log(d$n_season_fert+1)
d$logCompost <- log(d$n_season_compost+1)
d$logLime <- log(d$n_season_lime+1)
d$logFallow <- log(d$n_season_fallow+1)
Or look at BoxCox graph to empirically determine the right transformation. Log is assuming a diminishing return to an increasing X. That’s probably not the case with fertilizer. We’d actually expect an increasing return as values get larger. We use boxcox to see what the data suggest. We interpret it as follows:
library(MASS)
for(i in 1:length(agPrac)){
boxcox(lm(pH ~ d[,agPrac[i]], data=d))
}
For pH at least it seems like a log transform is appropriate. We can run this for all other variables as well to see what we get as well.
Let’s look at the log results: See here and here for guidance on intepreting log transformed right hand side variables. See here for additional guidance on choosing a transformation.
How to interpret RHS log transform: For a linear multivariate OLS regression, we say “a one unit increase in X causes a (coefficient) change in Y.” For a linear-log regression where the X variable is log transformed, we say a L percent change in X leads to a (coefficient*L) change in Y.
Question: Do we want to run this analysis for only OAF farmers? If so, adjust the sample accordingly.
logVars <- paste(names(d[grep("log", names(d))]), collapse=" + ")
list2 <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", logVars, "+ as.factor(cell)", sep="")), data=d)
return(mod)
})
list2b <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", logVars, sep="")), data=d)
return(mod)
})
suppressWarnings(
stargazer(list2, list2b, type="html",
title = "2016A Rwanda Soil Health Baseline - Log Agronomic Practice Models",
covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)"),
column.labels = c(rep(gsub("m3.","", soilVars),2)),
dep.var.caption = "",
dep.var.labels = c("",""),
add.lines = list(c("Cell FE?", rep("Yes", 5), rep("No", 5))),
notes = "Includes FE for cell",
omit=c("cell"), out=paste("output", "rw_baseline_agprac_log.htm", sep="/"))
)
plm.log <- function(x, range){
beta = round(summary(x)$coefficients[range,1],3)
beta.pval = summary(x)$coefficients[range,4]
beta.conv = ifelse(beta.pval < 0.01, "***", ifelse(
beta.pval < 0.05, "**", ifelse(
beta.pval < 0.1, "*", "")))
#beta.pval = round(beta.pval, 3)
outcome = paste(beta, beta.conv, sep = "")
outcome = c(outcome, unique(round(summary(x)$coefficients[1,1],3)))
res = data.frame(outcome, stringsAsFactors = F)
return(res)
}
rw.table15 <- do.call(cbind, lapply(list2, function(x){
plm.log(x, 2:5)
}))
colnames(rw.table15) <- soilVars
rownames(rw.table15) <- c(names(d)[grep("log", names(d))], "constant")
rw.table15 <- rw.table15[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(rw.table15, file=paste("output", "rwtable15_reg.csv", sep = "/"),
row.names = T)
# a 10 percent increase in x leads to B1*.1 change in Y
logTrans <- do.call(cbind, lapply(list2, function(x){
coeff = x$coefficients[2:5]
tenPercent = round(coeff*.1, 5)
names(tenPercent) <- paste("10% increase in ", gsub("log","", names(tenPercent)), " leads to:", sep="")
return(tenPercent)
}))
colnames(logTrans) <- soilVars
print(kable(logTrans))
| m3.Ca | m3.Mg | pH | Total.N | Total.C | ExAl | |
|---|---|---|---|---|---|---|
| 10% increase in Fert leads to: | -1.78758 | -0.11370 | -0.00150 | -0.00004 | -0.00089 | 0.00048 |
| 10% increase in Compost leads to: | 0.22044 | -0.02094 | 0.00129 | -0.00002 | 0.00061 | -0.00176 |
| 10% increase in Lime leads to: | 14.37415 | 2.65234 | 0.00599 | 0.00074 | 0.00658 | 0.00030 |
| 10% increase in Fallow leads to: | -11.54532 | -1.13393 | -0.01315 | 0.00013 | 0.00177 | 0.00873 |
Interpretation: When we transform the variables to log, the data starts to tell a more coherent story, at least directionally. If we remove the district FE, the coefficients gain significance.
Let’s look first at a naive model of One Acre Fund tenure on soil health. Remember: these data are not longitudinal! These data are not longitudinal and reflect farmer selection into One Acre Fund. While these models will try to be both robust and parsimonious, we will inevitabily suffer omitted variable bias due to a lack of an instrument.
list3 <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", "total.seasons + as.factor(cell)", sep="")), data=d)
return(mod)
})
stargazer(list3, type="html",
title = "2016A Rwanda Soil Health Baseline - Naive Tenure Models",
covariate.labels = c("OAF Tenure"),
dep.var.caption = "",
dep.var.labels = "",
column.labels = c(gsub("m3.","", soilVars)),
notes = "Includes FE for cell",
omit=c("cell"), out=paste("output", "rw_baseline_tenure.htm", sep="/"))
| Ca | Mg | pH | Total.N | Total.C | ExAl | |
| (1) | (2) | (3) | (4) | (5) | (6) | |
| OAF Tenure | -0.129 | 0.745 | 0.001 | -0.0002 | -0.004 | -0.002 |
| (3.471) | (0.663) | (0.003) | (0.0002) | (0.003) | (0.003) | |
| Constant | 443.267 | 86.727 | 5.178*** | 0.210*** | 3.451*** | 1.746*** |
| (475.253) | (90.857) | (0.424) | (0.026) | (0.369) | (0.406) | |
| Observations | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 |
| R2 | 0.500 | 0.494 | 0.564 | 0.501 | 0.438 | 0.556 |
| Adjusted R2 | 0.455 | 0.449 | 0.525 | 0.457 | 0.388 | 0.516 |
| Residual Std. Error (df = 2237) | 475.202 | 90.847 | 0.424 | 0.026 | 0.369 | 0.405 |
| F Statistic (df = 201; 2237) | 11.127*** | 10.880*** | 14.408*** | 11.196*** | 8.684*** | 13.913*** |
| Note: | p<0.1; p<0.05; p<0.01 | |||||
| Includes FE for cell | ||||||
Interpretation: The naive One Acre Fund tenure model suggest that across the board that additional years of 1AF practices have a negative effect on soil health parameters. Let’s combine 1AF tenure with the agronomic practices model above to build a more robust model:
list4 <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", logVars, "+ total.seasons + as.factor(cell)", sep="")), data=d)
return(mod)
})
stargazer(list4, type="html",
title = "2016A Rwanda Soil Health Baseline - Ag Practice and Tenure",
covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)", "OAF Tenure"),
dep.var.caption = "",
dep.var.labels = "",
column.labels = c(gsub("m3.","", soilVars)),
notes = "Includes FE for cell",
omit=c("cell"), out=paste("output", "rw_baseline_ag_tenure.htm", sep="/"))
| Ca | Mg | pH | Total.N | Total.C | ExAl | |
| (1) | (2) | (3) | (4) | (5) | (6) | |
| Seasons of Fertilizer (log) | -22.634 | -3.440 | -0.024 | 0.0003 | 0.002 | 0.014 |
| (16.929) | (3.258) | (0.015) | (0.001) | (0.013) | (0.014) | |
| Seasons of Compost (log) | 2.280 | -0.173 | 0.013 | -0.0003 | 0.006 | -0.018 |
| (14.833) | (2.854) | (0.013) | (0.001) | (0.012) | (0.013) | |
| Seasons of Lime (log) | 143.969*** | 26.634*** | 0.060* | 0.007*** | 0.065** | 0.003 |
| (39.808) | (7.660) | (0.035) | (0.002) | (0.031) | (0.034) | |
| Seasons of Fallow (log) | -115.872*** | -11.542*** | -0.132*** | 0.001 | 0.019 | 0.088*** |
| (18.857) | (3.629) | (0.017) | (0.001) | (0.015) | (0.016) | |
| OAF Tenure | 2.125 | 1.029 | 0.004 | -0.0003 | -0.005 | -0.004 |
| (4.209) | (0.810) | (0.004) | (0.0002) | (0.003) | (0.004) | |
| Constant | 435.090 | 86.437 | 5.151*** | 0.211*** | 3.444*** | 1.779*** |
| (470.889) | (90.611) | (0.418) | (0.026) | (0.370) | (0.403) | |
| Observations | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 |
| R2 | 0.511 | 0.499 | 0.578 | 0.505 | 0.440 | 0.562 |
| Adjusted R2 | 0.466 | 0.453 | 0.539 | 0.459 | 0.388 | 0.522 |
| Residual Std. Error (df = 2233) | 470.204 | 90.479 | 0.418 | 0.026 | 0.369 | 0.403 |
| F Statistic (df = 205; 2233) | 11.396*** | 10.863*** | 14.891*** | 11.092*** | 8.556*** | 14.004*** |
| Note: | p<0.1; p<0.05; p<0.01 | |||||
| Includes FE for cell | ||||||
Interpretation: Including agronomic practices and 1AF tenure in the same model dampens the magnitude, but not the significance, of 1AF tenure on soil health outcomes.
Thus far we have looked at aggregated historical plot level practices and their effect on soil health. We also asked farmers about their cultivation practices on their plot in the previous season, 15B. We have more precise information for fertilizer, compost and liming practices for the 15B season.
# scale all the field application variables to ares
d$ares <- d$field.size/100
d$fert1.are <- d$field_kg_fert1_15b/d$ares
d$fert2.are <- d$field_kg_fert2_15b/d$ares
d$fert.total.are <- apply(d[,c("fert1.are", "fert2.are")], 1, function(x){
sum(x, na.rm=T)
})
d$fert.total.are <- ifelse(is.na(d$fert1.are) & is.na(d$fert2.are), NA, d$fert.total.are)
d$compost.are <- d$field_kg_compost_15b/d$ares
d$lime.are <- d$kg_lime_15b/d$ares
intensityVars <- c("fert1.are", "fert2.are",
"compost.are", "lime.are")
cor(d[,intensityVars], use="complete.obs")
fert1.are fert2.are compost.are lime.are
fert1.are 1.0000000 0.9889407 0.5216134 0.8679373
fert2.are 0.9889407 1.0000000 0.5328874 0.8539114
compost.are 0.5216134 0.5328874 1.0000000 0.6236616
lime.are 0.8679373 0.8539114 0.6236616 1.0000000
#table(d$field_15b_fert_t, useNA = 'ifany')
d$field_15b_fert_t <- tolower(d$field_15b_fert_t)
names(d)[names(d)=="v69"] <- "field_15b_fert_t2"
#table(d$field_15b_fert_t2)
d$field_15b_fert_t2 <- tolower(d$field_15b_fert_t2)
d$dap1 <- ifelse(d$field_15b_fert_t=="dap", d$field_kg_fert1_15b, NA)
d$dap1.are <- d$dap1/d$ares
d$npk1 <- ifelse(grepl("npk", d$field_15b_fert_t), d$field_kg_fert1_15b, NA)
d$npk1.are <- d$npk1/d$ares
d$urea1 <- ifelse(d$field_15b_fert_t=="urea", d$field_kg_fert1_15b, NA)
d$urea1.are <- d$urea1/d$ares
d$urea2 <- ifelse(d$field_15b_fert_t2=="urea", d$field_kg_fert2_15b, NA)
d$urea2.are <- d$urea2/d$ares
d$total.urea.are <- apply(d[,c("urea1.are", "urea2.are")], 1, function(x){
sum(x, na.rm=T)
})
d$total.urea.are <- ifelse(is.na(d$urea1.are) & is.na(d$urea2.are), NA, d$total.urea.are)
for(i in 1:length(intensityVars)){
print(
ggplot(d, aes(x=d[,intensityVars[i]])) + geom_density() +
labs(x = intensityVars[i], title = intensityVars[i])
)
}
Conclusion - Take 1: The application rate per are variables are weird. I think it’s because of the field dimensions. I’m going to go back to the field dimensions and check this.
Let’s look at the dimensions of the fields that have large application rates
d[d$fert1.are>10 & !is.na(d$fert1.are), c("field_dim1", "field_dim2", "field.size", "fert1.are")]
d[d$fert2.are>10 & !is.na(d$fert2.are), c("field_dim1", "field_dim2", "field.size", "fert2.are")]
d[d$compost.are>500 & !is.na(d$compost.are), c("field_dim1", "field_dim2", "field.size", "compost.are")]
# there's a field that is 1 meter wide? Surely not.
d[abs(d$lime.are)>40 & !is.na(d$lime.are), c("field_dim1", "field_dim2", "field.size", "lime.are")]
# how is there a negative quantity of lime?
Generate tenure vs. pH and tenure vs. fertilizer seasons scatter plots:
tenure.ph <- ggplot(d, aes(x=jitter(total.seasons), y=pH)) + geom_point() +
geom_smooth(method="loess") +
labs(x = "OAF Tenure", y = "Soil pH", title = "OAF Tenure vs. soil pH in Rwanda")
tenure.ph
pdf(paste("output", "rw tenure vs ph.pdf", sep = "/"), width=11, height=8.5)
print(tenure.ph)
dev.off()
quartz_off_screen
2
tenure.fertilizer <- ggplot(d, aes(x= jitter(total.seasons), y=jitter(n_season_fert))) + geom_point() +
geom_smooth(method='loess') +
labs(x = "OAF Tenure", y = "Years of Fertilizer in past 5 years", title = "OAF Tenure vs. soil pH in Rwanda")
tenure.fertilizer
pdf(paste("output", "rw tenure vs fertilizer.pdf", sep = "/"), width=11, height=8.5)
print(tenure.fertilizer)
dev.off()
quartz_off_screen
2
# this should be kg of fertilizer used in this field. Compost is off the charts. Convert this to compost per sq meter
plm.t16 <- function(x, range){
beta = round(summary(x)$coefficients[range,1],3)
beta.pval = round(summary(x)$coefficients[range,4],3)
beta.conv = ifelse(beta.pval < 0.01, "***", ifelse(
beta.pval < 0.05, "**", ifelse(
beta.pval < 0.1, "*", "")))
#beta.pval = round(beta.pval, 3)
outcome = paste(beta, " (", beta.pval, ")", sep = "")
outcome = c(outcome, unique(round(summary(x)$coefficients[1,1],3)))
res = data.frame(outcome, stringsAsFactors = F)
return(res)
}
previousSeasonfert <- paste("fert1.are", "as.factor(cell)", sep=" + ")
list5 <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", previousSeasonfert, sep="")), data=d)
return(mod)
})
table16 <- do.call(cbind, lapply(list5, function(x){
plm.t16(x, 2)
}))
colnames(table16) <- soilVars
rownames(table16) <- c("Fertilizer (kg/are)", "constant")
table16 <- table16[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16, file=paste("output", "rwtable16_fert.csv", sep = "/"),
row.names = T)
previousSeasonfertAll <- paste("fert.total.are", "as.factor(cell)", sep=" + ")
list5a <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", previousSeasonfertAll, sep="")), data=d)
return(mod)
})
table16a <- do.call(cbind, lapply(list5a, function(x){
plm.t16(x, 2)
}))
colnames(table16a) <- soilVars
rownames(table16a) <- c("Fertilizer (all) (kg/are)", "constant")
table16a <- table16a[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16a, file=paste("output", "rwtable16_fert_all.csv", sep = "/"),
row.names = T)
# these objects have the same name as the fertilizer objects for simplicity. Run
# from the top to avoid overwriting issues.
previousSeasoncompost <- paste("compost.are", "as.factor(cell)", sep=" + ")
temp <- subset(d, d$compost.are>0)
list5b <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("temp[,x] ~", previousSeasoncompost, sep="")), data=temp)
return(mod)
})
table16b <- do.call(cbind, lapply(list5b, function(x){
plm.t16(x, 2)
}))
colnames(table16b) <- soilVars
rownames(table16b) <- c("Compost (kg/are)", "constant")
table16b <- table16b[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16b, file=paste("output", "rwtable16_compost.csv", sep = "/"),
row.names = T)
The compost regression results are suprising. Look at Carbon and compost scatter plot. If we include 0 compost, we see no relationship. If we exclude 0 compost, we
soilC.compost <- ggplot(temp, aes(x = compost.are, y=Total.C)) + geom_point() +
scale_x_continuous(limits=c(0,50)) + # remove larger values from the graph.
geom_smooth(method='lm') +
labs(title = "Compost and Soil Carbon in Rwanda", x = "Compost (kg/acre)", y = "Total Carbon")
soilC.compost
pdf(paste("output", "compost vs soil c.pdf", sep = "/"), width=11, height=8.5)
print(soilC.compost)
dev.off()
quartz_off_screen
2
There are too many zero or near zero values. We can conclude that perhaps we’re not asking this question well. Even when we remove non-zero values, there’s little discernable relationship.
# these objects have the same name as the fertilizer objects for simplicity. Run
# from the top to avoid overwriting issues.
previousSeasonlime <- paste("lime.are", "as.factor(cell)", sep=" + ")
list5c <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", previousSeasonlime, sep="")), data=d)
return(mod)
})
table16c <- do.call(cbind, lapply(list5c, function(x){
plm.t16(x, 2)
}))
colnames(table16c) <- soilVars
rownames(table16c) <- c("Lime (kg/are)", "constant")
table16c <- table16c[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16c, file=paste("output", "rwtable16_lime.csv", sep = "/"),
row.names = T)
previousSeasonDAP <- paste("dap1.are", "as.factor(cell)", sep=" + ")
list5d <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", previousSeasonDAP, sep="")), data=d)
return(mod)
})
table16d <- do.call(cbind, lapply(list5d, function(x){
plm.t16(x, 2)
}))
colnames(table16d) <- soilVars
rownames(table16d) <- c("DAP (kg/are)", "constant")
table16d <- table16d[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16d, file=paste("output", "rwtable16_dap.csv", sep = "/"),
row.names = T)
previousSeasonNPK <- paste("npk1.are", "as.factor(cell)", sep=" + ")
list5e <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", previousSeasonNPK, sep="")), data=d)
return(mod)
})
table16e <- do.call(cbind, lapply(list5e, function(x){
plm.t16(x, 2)
}))
colnames(table16e) <- soilVars
rownames(table16e) <- c("NPK (kg/are)", "constant")
table16e <- table16e[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16e, file=paste("output", "rwtable16_npk.csv", sep = "/"),
row.names = T)
previousSeasonUrea <- paste("total.urea.are", "as.factor(cell)", sep=" + ")
list5f <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", previousSeasonUrea, sep="")), data=d)
return(mod)
})
table16f <- do.call(cbind, lapply(list5f, function(x){
plm.t16(x, 2)
}))
colnames(table16f) <- soilVars
rownames(table16f) <- c("Urea (kg/are)", "constant")
table16f <- table16f[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16f, file=paste("output", "rwtable16_urea.csv", sep = "/"),
row.names = T)
stargazer(list5, type="html",
title = "2016A Rwanda Soil Health Baseline - 15B practices",
covariate.labels = c("Fertilizer Rate (log)"),
dep.var.caption = "",
dep.var.labels = "",
column.labels = c(gsub("m3.","", soilVars)),
notes = "Includes FE for cell",
omit=c("cell"), out=paste("output", "rw_baseline_15b_ag.htm", sep="/"))
Let’s look at farmer perceived fertility as a predictor of soil health. We’ll set ‘same fertility’ as the reference category.
d$fertility_qual <- relevel(as.factor(d$general_field_infocompare_fertil), ref="same")
list6 <- lapply(soilVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", "+ fertility_qual + as.factor(cell)", sep="")), data=d)
return(mod)
})
stargazer(list6, type="html",
title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility",
covariate.labels = c("Farmer Opinion - Less Fertile",
"Farmer Opinion - More Fertile"),
dep.var.caption = "",
dep.var.labels = "",
column.labels = c(gsub("m3.","", soilVars)),
notes = "Reference category is same fertility as other fields. Includes FE for cell",
notes.align = "l",
omit=c("cell"), out=paste("output", "rw_baseline_fertility_qual.htm", sep="/"))
| Ca | Mg | pH | Total.N | Total.C | ExAl | |
| (1) | (2) | (3) | (4) | (5) | (6) | |
| Farmer Opinion - Less Fertile | -128.212*** | -17.537*** | -0.134*** | 0.002 | 0.013 | 0.116*** |
| (26.465) | (5.077) | (0.024) | (0.001) | (0.021) | (0.023) | |
| Farmer Opinion - More Fertile | 22.239 | 1.717 | 0.014 | -0.001 | -0.009 | -0.017 |
| (29.278) | (5.617) | (0.026) | (0.002) | (0.023) | (0.025) | |
| Constant | 571.222 | 105.753 | 5.314*** | 0.208*** | 3.430*** | 1.626*** |
| (473.062) | (90.758) | (0.421) | (0.026) | (0.370) | (0.403) | |
| Observations | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 | 2,439 |
| R2 | 0.506 | 0.497 | 0.571 | 0.502 | 0.438 | 0.562 |
| Adjusted R2 | 0.462 | 0.452 | 0.533 | 0.457 | 0.387 | 0.522 |
| Residual Std. Error (df = 2236) | 472.321 | 90.616 | 0.421 | 0.026 | 0.370 | 0.403 |
| F Statistic (df = 202; 2236) | 11.348*** | 10.943*** | 14.756*** | 11.160*** | 8.628*** | 14.179*** |
| Note: | p<0.1; p<0.05; p<0.01 | |||||
| Reference category is same fertility as other fields. Includes FE for cell | ||||||
Interpretation: Farmers understand their fields well. Their categorization of which field are more and less fertile corresponds to our quantified measures of soil health. The only features farmers don’t seem to get correct are nitrogen and carbon. The nitrogen and carbon levels are indistinguishable in ‘low fertility’ fields relative to the fields deemed to be the ‘same fertility.’ Reminder: We need to remember that farmers are only evaluting one of their fields thus we are not able to account for the quality of the farmer in assessing his/her fields.
# merge wetChem in with d
names(wetChem)[2:21] <- paste("wet.", names(wetChem)[2:21], sep = "")
d <- left_join(d, wetChem, by="SSN")
joining factor and character vector, coercing into character vector
ggplot(d, aes(x=wet.C.E.C)) + geom_density() +
geom_vline(xintercept = median(d$wet.C.E.C, na.rm=T), color="red") +
geom_vline(xintercept = mean(d$wet.C.E.C, na.rm=T), color="blue")
summary(d$wet.C.E.C)[3:4]
Median Mean
8.620 9.684
wetVars <- names(d)[grep("wet.", names(d))]
for(i in 1:length(wetVars)){
print(
ggplot(data=d, aes(x=as.factor(client), y=d[,wetVars[i]])) +
geom_boxplot() +
labs(x="Tubura Farmer", y=wetVars[i], title = paste("RW baseline wet chem - ", wetVars[i], sep = ""))
)
}
list7 <- lapply(wetVars, function(x){
mod <- lm(as.formula(paste("d[,x] ~", "+ fertility_qual + as.factor(cell)", sep="")), data=d)
return(mod)
})
suppressWarnings(
stargazer(list7, type="html",
title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)",
covariate.labels = c("Farmer Opinion - Less Fertile",
"Farmer Opinion - More Fertile"),
dep.var.caption = "",
dep.var.labels = "",
column.labels = c(gsub("wet.","", wetVars)),
notes = "Reference category is same fertility as other fields. Includes FE for cell",
notes.align = "l",
omit=c("cell"), out=paste("output", "rw_baseline_fertility_qual_wet.htm", sep="/"))
)
| pH | EC..S. | P | K | Ca | Mg | S | Na | Fe | Mn | B | Cu | Zn | C.E.C | TN | C.N | Exch..Acidity | Acid.Saturation | Exch.Al | OC | |
| (1) | (2) | (3) | (4) | (5) | (6) | (7) | (8) | (9) | (10) | (11) | (12) | (13) | (14) | (15) | (16) | (17) | (18) | (19) | (20) | |
| Farmer Opinion - Less Fertile | -0.228 | -5.566 | -8.792 | -55.794* | -74.347 | -35.860 | 2.483 | -1.881 | 19.090 | -33.558 | -0.075 | -0.424 | -0.334 | -0.499 | 0.001 | -0.201 | 0.385** | 10.942*** | 0.302** | 0.063 |
| (0.149) | (11.230) | (19.805) | (28.523) | (150.909) | (36.429) | (2.272) | (2.951) | (17.447) | (21.295) | (0.225) | (0.339) | (0.772) | (1.119) | (0.008) | (0.632) | (0.163) | (3.777) | (0.126) | (0.137) | |
| Farmer Opinion - More Fertile | -0.112 | -11.733 | -11.765 | -80.863** | -174.963 | -71.564 | 2.687 | -1.391 | -13.159 | -11.052 | -0.061 | -0.250 | -0.880 | -1.523 | -0.023** | 0.318 | 0.157 | 4.668 | 0.119 | -0.277 |
| (0.186) | (14.042) | (25.228) | (36.333) | (192.230) | (46.404) | (2.895) | (3.759) | (22.224) | (27.126) | (0.286) | (0.432) | (0.983) | (1.425) | (0.010) | (0.806) | (0.208) | (4.811) | (0.161) | (0.174) | |
| Constant | 6.225*** | 127.000*** | 8.910 | 213.000*** | 1,745.000*** | 435.000*** | 15.700** | 24.300*** | 61.450 | 194.000*** | 0.415 | 2.670*** | 4.415** | 15.650*** | 0.195*** | 11.750*** | 0.140 | 1.100 | 0.023 | 2.335*** |
| (0.409) | (30.862) | (55.804) | (80.369) | (425.217) | (102.647) | (6.403) | (8.315) | (49.160) | (60.004) | (0.633) | (0.957) | (2.175) | (3.152) | (0.022) | (1.782) | (0.459) | (10.642) | (0.356) | (0.385) | |
| Observations | 234 | 234 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 |
| R2 | 0.735 | 0.594 | 0.705 | 0.661 | 0.750 | 0.761 | 0.727 | 0.706 | 0.762 | 0.695 | 0.525 | 0.695 | 0.540 | 0.751 | 0.792 | 0.670 | 0.769 | 0.769 | 0.793 | 0.733 |
| Adjusted R2 | 0.407 | 0.091 | 0.350 | 0.254 | 0.450 | 0.473 | 0.398 | 0.353 | 0.476 | 0.330 | -0.046 | 0.328 | -0.013 | 0.451 | 0.541 | 0.274 | 0.491 | 0.492 | 0.544 | 0.412 |
| Residual Std. Error | 0.579 (df = 104) | 43.645 (df = 104) | 78.919 (df = 109) | 113.659 (df = 109) | 601.348 (df = 109) | 145.165 (df = 109) | 9.055 (df = 109) | 11.760 (df = 109) | 69.523 (df = 109) | 84.858 (df = 109) | 0.895 (df = 109) | 1.353 (df = 109) | 3.077 (df = 109) | 4.458 (df = 109) | 0.032 (df = 109) | 2.520 (df = 109) | 0.649 (df = 109) | 15.049 (df = 109) | 0.503 (df = 109) | 0.544 (df = 109) |
| F Statistic | 2.238*** (df = 129; 104) | 1.180 (df = 129; 104) | 1.985*** (df = 131; 109) | 1.625*** (df = 131; 109) | 2.498*** (df = 131; 109) | 2.642*** (df = 131; 109) | 2.214*** (df = 131; 109) | 2.001*** (df = 131; 109) | 2.661*** (df = 131; 109) | 1.900*** (df = 131; 109) | 0.920 (df = 131; 109) | 1.894*** (df = 131; 109) | 0.976 (df = 131; 109) | 2.503*** (df = 131; 109) | 3.160*** (df = 131; 109) | 1.691*** (df = 131; 109) | 2.765*** (df = 131; 109) | 2.772*** (df = 131; 109) | 3.189*** (df = 131; 109) | 2.283*** (df = 131; 109) |
| Note: | p<0.1; p<0.05; p<0.01 | |||||||||||||||||||
| Reference category is same fertility as other fields. Includes FE for cell | ||||||||||||||||||||
Interpretation: Our sample size decreases considerably when we look only at wet chemistry results. Thus, we do not see the same signficant relationships we saw between farmer perceived fertility and soil characteristics we saw when looking at the predicted values.
pca1 <- prcomp(d[,soilVars], scale=TRUE, center=TRUE)
print(pca1)
Standard deviations:
[1] 1.7694781 1.4197211 0.5985909 0.4836832 0.4092214 0.3059685
Rotation:
PC1 PC2 PC3 PC4 PC5 PC6
m3.Ca 0.497746227 -0.233189315 0.0200217427 -0.5908066 -0.05845757 0.58736772
m3.Mg 0.477406109 -0.130704228 -0.8045486852 0.2566894 -0.09624119 -0.18041456
pH 0.531809652 0.007760003 0.3663922355 -0.2518775 0.19398670 -0.69411952
Total.N 0.005638237 -0.663812926 0.2859668476 0.2266671 -0.64277520 -0.11404215
Total.C -0.100238773 -0.658444962 -0.0007406172 0.1625259 0.72558685 0.05925313
ExAl -0.481072753 -0.232984039 -0.3691607865 -0.6662006 -0.10026596 -0.35232273
plot(pca1)
pca1$rotation
PC1 PC2 PC3 PC4 PC5 PC6
m3.Ca 0.497746227 -0.233189315 0.0200217427 -0.5908066 -0.05845757 0.58736772
m3.Mg 0.477406109 -0.130704228 -0.8045486852 0.2566894 -0.09624119 -0.18041456
pH 0.531809652 0.007760003 0.3663922355 -0.2518775 0.19398670 -0.69411952
Total.N 0.005638237 -0.663812926 0.2859668476 0.2266671 -0.64277520 -0.11404215
Total.C -0.100238773 -0.658444962 -0.0007406172 0.1625259 0.72558685 0.05925313
ExAl -0.481072753 -0.232984039 -0.3691607865 -0.6662006 -0.10026596 -0.35232273
The first principal component is composed primarily of soil pH related variables, Ca, Mg, and pH. The second capture N and C. These groupings (pH grouping and C and N) are not all that surprising given that our predicted soil variable set is fairly limited.
If the variables have the same sign that indicates that they are positively correlated in the principal component. In the first principal component, we see Ca, Mg and pH loading in the same direction. In the second principal component, we see N and C moving in the same direction. Ca and Mg are also associated in the same direction but to a lesser extent.
ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$fertility_qual)) + geom_point() +
labs(title = "PCA of soil attributes and farmer perceived soil fertility",
x = "First PCA", y= "Second PCA", color="Field Quality")
When we layer farmer perceived soil fertility on top of the principal component scatter plot, no clear pattern emergres. Fields with the same fertilitiy, less and more fertility are indistinguishable by the principal components. Thus, there is not a clear profile for farmer identified healthier soil or weaker soil based on principal components.
This section will build on the principal component work above and look at improving understanding of local context to inform local adaptation. Here we will also construct soil profiles to simplify the scaling of promising products and practices to targeted locations. We don’t have a full suite of predictors so we can’t look at a comprehensive soil profile.
ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$district)) + geom_point() +
labs(title = "PCA of soil attributes and district",
x = "First PCA", y= "Second PCA", color="District")
When we color the figure by district, we start to see a pattern emerge. However, there are too many points to clearly detect where all districts fall. Let’s instead look at the data by AEZ.
ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$aez)) + geom_point() +
labs(title = "PCA of soil attributes and AEZ",
x = "First PCA - Ca/Mg/pH", y= "Second PCA - N and C", color="AEZ")
Coloring the points by AEZ, we see a much clearer trend. The east is to the right of our graphic, then the central plateau followed by lake Kive and then Congo Nile in green on the left. What does this mean in terms of actual soil features? Let’s look at the figure but with the variable associations layered on top. See here for documentation
library(pca3d)
pca2d( pca1, biplot= TRUE, shape= 19, col= "black" )
It appears that the right side of the graph, the eastern AEZ, is associated with pH, Mg and Ca. This indicates that as the first principal component increases, so does the level of pH, Ca and Mg. Conversely, total N and C increase as the second principal component decreases. In terms of the AEZ figure above, this suggest that the Congo Nile AEZ has higher levels of N and C. Let’s test these hypotheses with a simple summary table:
print(kable(aggregate(d[,soilVars], by=list(d$aez), function(x){
round(mean(x, na.rm=T),5)
})))
| Group.1 | m3.Ca | m3.Mg | pH | Total.N | Total.C | ExAl |
|---|---|---|---|---|---|---|
| Central Plateau | 793.5114 | 207.0087 | 5.53865 | 0.13754 | 1.87063 | 0.43171 |
| Congo Nile | 396.7151 | 125.7663 | 4.96555 | 0.16072 | 2.30810 | 1.14672 |
| East | 1341.5413 | 274.0356 | 5.93730 | 0.17050 | 2.12779 | 0.23845 |
| Lake Kivu | 742.6964 | 229.2307 | 5.47636 | 0.13980 | 2.01218 | 0.56104 |
Confirmed! This likely also suggests that Congo Nile is at a higher altitude than the surrounding areas and that eastern Rwanda has relatively less weathered soils compared to western.
Consequence for trial placement: The Rwanda program is already blocking trials by AEZ but these data confirm that AEZ reflects meaningfu soil variation and thus captures key growing conditions for Rwandan farmers. Blocking trials by AEZ in Rwanda will enable us to evaluate trial hypotheses in more neutral pH ranges and higher N and C conditions.
wetVal <- d[complete.cases(d[,wetVars]),]
pca2 <- prcomp(wetVal[,c("wet.C.E.C", "wet.pH", "wet.Mg", "wet.Ca")], center=TRUE, scale=TRUE)
pca2 <- prcomp(wetVal[, wetVars], center=TRUE, scale=TRUE)
#plot(pca2)
ggplot(as.data.frame(pca2$x),aes(x=PC1,y=PC2, color=as.factor(wetVal$aez))) + geom_point() +
labs(title = "Wet Chem PCA with AEZ", x = "First PC", y="Second PC",
color="AEZ")
#pca2$rotation
pca2d(pca2, biplot= TRUE, shape= 19, col= "black")
# put pca2$x PC1 in the main data to run the fertility perception model.
#mod8 <- lm(
suppressWarnings(
stargazer(list7, type="html",
title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)",
covariate.labels = c("Farmer Opinion - Less Fertile",
"Farmer Opinion - More Fertile"),
dep.var.caption = "",
dep.var.labels = "",
column.labels = c(gsub("wet.","", wetVars)),
notes = "Reference category is same fertility as other fields. Includes FE for cell",
notes.align = "l",
omit=c("cell"), out=paste("output", "rw_baseline_fertility_qual_wet.htm", sep="/"))
)
| pH | EC..S. | P | K | Ca | Mg | S | Na | Fe | Mn | B | Cu | Zn | C.E.C | TN | C.N | Exch..Acidity | Acid.Saturation | Exch.Al | OC | |
| (1) | (2) | (3) | (4) | (5) | (6) | (7) | (8) | (9) | (10) | (11) | (12) | (13) | (14) | (15) | (16) | (17) | (18) | (19) | (20) | |
| Farmer Opinion - Less Fertile | -0.228 | -5.566 | -8.792 | -55.794* | -74.347 | -35.860 | 2.483 | -1.881 | 19.090 | -33.558 | -0.075 | -0.424 | -0.334 | -0.499 | 0.001 | -0.201 | 0.385** | 10.942*** | 0.302** | 0.063 |
| (0.149) | (11.230) | (19.805) | (28.523) | (150.909) | (36.429) | (2.272) | (2.951) | (17.447) | (21.295) | (0.225) | (0.339) | (0.772) | (1.119) | (0.008) | (0.632) | (0.163) | (3.777) | (0.126) | (0.137) | |
| Farmer Opinion - More Fertile | -0.112 | -11.733 | -11.765 | -80.863** | -174.963 | -71.564 | 2.687 | -1.391 | -13.159 | -11.052 | -0.061 | -0.250 | -0.880 | -1.523 | -0.023** | 0.318 | 0.157 | 4.668 | 0.119 | -0.277 |
| (0.186) | (14.042) | (25.228) | (36.333) | (192.230) | (46.404) | (2.895) | (3.759) | (22.224) | (27.126) | (0.286) | (0.432) | (0.983) | (1.425) | (0.010) | (0.806) | (0.208) | (4.811) | (0.161) | (0.174) | |
| Constant | 6.225*** | 127.000*** | 8.910 | 213.000*** | 1,745.000*** | 435.000*** | 15.700** | 24.300*** | 61.450 | 194.000*** | 0.415 | 2.670*** | 4.415** | 15.650*** | 0.195*** | 11.750*** | 0.140 | 1.100 | 0.023 | 2.335*** |
| (0.409) | (30.862) | (55.804) | (80.369) | (425.217) | (102.647) | (6.403) | (8.315) | (49.160) | (60.004) | (0.633) | (0.957) | (2.175) | (3.152) | (0.022) | (1.782) | (0.459) | (10.642) | (0.356) | (0.385) | |
| Observations | 234 | 234 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 | 241 |
| R2 | 0.735 | 0.594 | 0.705 | 0.661 | 0.750 | 0.761 | 0.727 | 0.706 | 0.762 | 0.695 | 0.525 | 0.695 | 0.540 | 0.751 | 0.792 | 0.670 | 0.769 | 0.769 | 0.793 | 0.733 |
| Adjusted R2 | 0.407 | 0.091 | 0.350 | 0.254 | 0.450 | 0.473 | 0.398 | 0.353 | 0.476 | 0.330 | -0.046 | 0.328 | -0.013 | 0.451 | 0.541 | 0.274 | 0.491 | 0.492 | 0.544 | 0.412 |
| Residual Std. Error | 0.579 (df = 104) | 43.645 (df = 104) | 78.919 (df = 109) | 113.659 (df = 109) | 601.348 (df = 109) | 145.165 (df = 109) | 9.055 (df = 109) | 11.760 (df = 109) | 69.523 (df = 109) | 84.858 (df = 109) | 0.895 (df = 109) | 1.353 (df = 109) | 3.077 (df = 109) | 4.458 (df = 109) | 0.032 (df = 109) | 2.520 (df = 109) | 0.649 (df = 109) | 15.049 (df = 109) | 0.503 (df = 109) | 0.544 (df = 109) |
| F Statistic | 2.238*** (df = 129; 104) | 1.180 (df = 129; 104) | 1.985*** (df = 131; 109) | 1.625*** (df = 131; 109) | 2.498*** (df = 131; 109) | 2.642*** (df = 131; 109) | 2.214*** (df = 131; 109) | 2.001*** (df = 131; 109) | 2.661*** (df = 131; 109) | 1.900*** (df = 131; 109) | 0.920 (df = 131; 109) | 1.894*** (df = 131; 109) | 0.976 (df = 131; 109) | 2.503*** (df = 131; 109) | 3.160*** (df = 131; 109) | 1.691*** (df = 131; 109) | 2.765*** (df = 131; 109) | 2.772*** (df = 131; 109) | 3.189*** (df = 131; 109) | 2.283*** (df = 131; 109) |
| Note: | p<0.1; p<0.05; p<0.01 | |||||||||||||||||||
| Reference category is same fertility as other fields. Includes FE for cell | ||||||||||||||||||||
dist.sum <- aggregate(d[,out.list], by=list(d$district), function(x){
return(c(
paste(
round(mean(x, na.rm=T),3), " (", round(median(x, na.rm=T),3), ")",
" (", round(sd(x, na.rm=T),2), ")", sep = ""
)
))
})
write.csv(dist.sum, file=paste("output", "district covariate summary.csv", sep = "/"))
cell.sum <- aggregate(d[,out.list], by=list(d$cell), function(x){
return(c(
paste(
round(mean(x, na.rm=T),3), " (", round(median(x, na.rm=T),3), ")",
" (", round(sd(x, na.rm=T),2), ")", sep = ""
)
))
})
write.csv(cell.sum, file=paste("output", "cell covariate summary.csv", sep = "/"))
remove <- "female"
genderBalance <- out.list[!out.list %in% remove]
equal <- do.call(rbind, lapply(genderBalance, function(x){
out <- t.test(d[,x] ~ d[,"female"], data=d)
tab <- data.frame(out[[5]][[2]],out[[5]][[1]], out[3])
tab[,1:2] <- round(tab[,1:2],3)
names(tab) <- c(names(out[[5]]), "pvalue")
#tab[,3] <- p.adjust(tab[,3], method="holm")
#tab[,3] <- ifelse(tab[,3] < 0.001, "< 0.001", round(tab[,3],3))
#print(tab)
return(tab)
}))
rownames(equal) <- NULL
# order variables
equal$pvalue <- p.adjust(equal$pvalue, method="fdr")
rownames(equal) <- genderBalance
equal <- equal[order(equal$pvalue),]
equal$pvalue <- ifelse(equal$pvalue < 0.001, "< 0.001", round(equal$pvalue,3))
colnames(equal) <- c("Male Farmers","Female Farmers", "p-value")
write.csv(equal, file=paste("output", "rw female farmer status.csv", sep = "/"), row.names=T)
write.csv(d, file=paste("output", "rwanda_shs_merged_clean.csv",sep = "/"), row.names = F)
We need to do a more rigorous job of accounting for differences between Tubura farmers and identified control farmers. Execute propensity score matching (PSM) to identify control farmers that overlap with Tubura farmers with regard to their likelihood of being a Tubura farmer.
psmVars <- paste(c("female", "age", "hhsize", "total.seasons",
"cows", "goats", "chickens", "pigs", "sheep"),
collapse=" + ")
reg <- glm(as.formula(paste("client ~", psmVars, sep="")),
family= binomial(link="logit"), data=d)
summary(reg)
Call:
glm(formula = as.formula(paste("client ~", psmVars, sep = "")),
family = binomial(link = "logit"), data = d)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.5170 -0.8509 0.0426 0.9319 2.0723
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.413913 0.202863 2.040 0.0413 *
female -0.744249 0.097219 -7.655 1.93e-14 ***
age -0.024071 0.003368 -7.147 8.90e-13 ***
hhsize 0.036271 0.022919 1.583 0.1135
total.seasons 0.519325 0.028386 18.295 < 2e-16 ***
cows 0.006420 0.017843 0.360 0.7190
goats -0.004032 0.029835 -0.135 0.8925
chickens 0.021716 0.017745 1.224 0.2211
pigs 0.051552 0.057656 0.894 0.3713
sheep 0.047237 0.069853 0.676 0.4989
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3381.1 on 2438 degrees of freedom
Residual deviance: 2573.8 on 2429 degrees of freedom
AIC: 2593.8
Number of Fisher Scoring iterations: 5
# summarize predicted probabilities
pr <- data.frame(pr_score = predict(reg, type='response'), treat = d$client)
# graph
psmGraph <- ggplot() + geom_histogram(data=subset(pr, pr$treat==1), aes(x = pr_score, y=..count.., fill=as.factor(treat)), bins=80, position = "identity") +
geom_histogram(data=subset(pr, pr$treat==0), aes(x=pr_score, y=-..count.., fill=as.factor(treat)), bins=80, position = "identity") +
scale_y_continuous(limits=c(-150,150)) +
labs(title ="PSM score overlap", x = "PSM score", y="Farmer count",
fill="Tubura/Control")
print(psmGraph)
pdf(file=paste("output", "rw_baseline_psm_overlap.pdf", sep="/"), height=8.5, width=11)
print(psmGraph)
dev.off()
quartz_off_screen
2
Interpretation We have some overlap but it’s clear that Tubura farmers occupy a different range than the identified control farmers. Let’s continue with the PSM matching process but restrict ourselves to Tubura and control farmers that meet a certain PSM matching radius.
We have to indicate a variable for matching. I’m choosing pH as we know it to be an issue in most of our operating areas and addressing soil acidity has numerous residual benefits to soil health. I’ll want to do this for all soil outcomes, however.
# PSM prep
tr <- cbind(d$client)
x <- d[, unlist(strsplit(psmVars, " + ", fixed=T))]
y <- soilVars
# PSM
set.seed(20161102)
m <- lapply(y, function(response){
suppressWarnings(
mod <- Match(Y = d[,response], Tr = tr, X = reg$fitted, ties=FALSE, replace=FALSE, caliper=0.25, estimand = "ATE")
)
matchRes <- MatchBalance(tr ~ d[,response], match.out=mod, nboots=500, data=d, print.level = 0)
return(list(mod, matchRes))
})
#lapply(m, summary)
Now check the naive model approach for PSM balance.
matchRes <- do.call(rbind, lapply(1:length(m), function(model){
val <- as.data.frame(cbind(
standard.diff=m[[model]][[2]]$AfterMatching[[1]]$sdiff,
var.ratio = m[[model]][[2]]$AfterMatching[[1]]$var.ratio,
sdiff.adj = m[[model]][[2]]$AfterMatching[[1]]$sdiff/100))
return(val)
}))
rownames(matchRes) <- y
print(kable(matchRes))
| standard.diff | var.ratio | sdiff.adj | |
|---|---|---|---|
| m3.Ca | -2.9586010 | 0.8621133 | -0.0295860 |
| m3.Mg | -9.8036266 | 0.7586184 | -0.0980363 |
| pH | 0.9623464 | 0.9756746 | 0.0096235 |
| Total.N | 6.2212453 | 1.0317110 | 0.0622125 |
| Total.C | 0.7591351 | 1.0076355 | 0.0075914 |
| ExAl | -1.2303827 | 0.9764596 | -0.0123038 |
Interpretation: We want to see standard mean differences less than the absolute value of 0.25 (or 0.1 if we’re being conservative) and variance ratios close to 1 but certainly between 0.5 and 2.
According to the CRAN summary, sdiff is the standardized difference between the treatment and control units multiplied by 100. If I divide by 100, the values come much closer to reasonable value.
The common model approach doesn’t seem to be working for any of the variables. I’m going to rework the modeling approach so we can fit different models for each outcome upon which we’re trying to match.
d$age2 <- d$age^2
d$hhsize_age <- d$hhsize*d$age
d$hhsize2 <- d$hhsize^2
coreVars = c("female", "age", "hhsize", "own", "as.factor(cell)", "cows", "goats", "chickens", "pigs", "sheep", "black_soil", "red_soil", "sandy_soil")
psmList <- list(
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="m3.Ca"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="m3.Mg"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="pH"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="Total.N"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="Total.C"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="ExAl"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="n_season_fert"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="n_season_compost"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="n_seasons_leg_1"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="n_season_fallow"),
list(tr = "client",
psmVars = paste(coreVars,
collapse=" + "),
y="logFert"),
list(tr = "client",
psmVars = paste(c(coreVars, "age2",
"hhsize2"),
collapse=" + "),
y="n_seasons_leg_2"),
list(tr = "client",
psmVars = paste(c(coreVars, "age2",
"hhsize2"),
collapse=" + "),
y="n_season_lime"),
list(tr = "client",
psmVars = paste(c("female", "age", "hhsize", "own",
"cows", "goats", "chickens", "pigs", "sheep",
"as.factor(district)", "age2", "hhsize2",
"hhsize_age"),
collapse=" + "),
y="fert1.are"),
# list(tr = "client",
# psmVars = paste(c("female", "age", "hhsize","as.factor(district)",
# "cows", "goats", "chickens", "pigs", "sheep","age2", "hhsize2",
# "hhsize_age"),
# collapse=" + "),
# y="fert2.are"),
list(tr = "client",
psmVars = paste(c("female", "age", "hhsize","as.factor(district)",
"cows", "goats", "chickens", "pigs", "sheep","age2", "hhsize2",
"hhsize_age"),
collapse=" + "),
y="compost.are")
)
# fertilizer application in 15B has missing values at the cell level so i include
# a district level control instead. We don't get a good fit with the current model.
# adjust!
# aggregate(d$fert1.are, by=list(d$client, d$cell), FUN=mean, na.rm=T)
# lime.are is also being finicky
# PSM
set.seed(20161102)
m <- lapply(psmList, function(listInput){
naCheck <- unlist(strsplit(gsub("\\+", "", as.vector(listInput$psmVars))," ", fixed=TRUE))
naCheck <- naCheck[-which(naCheck=="")]
# keep complete cases of outcome variable
k <- d[complete.cases(d[,listInput$y]),]
k <- k[complete.cases(k[,listInput$y]),]
# run glm regression:
reg <- glm(as.formula(paste(listInput$tr, "~", listInput$psmVars, sep="")), family= binomial(link="logit"), data=k)
suppressWarnings(
mod <- Match(Y = k[,listInput$y], Tr = k[,listInput$tr], X = reg$fitted, ties=FALSE, replace=FALSE, caliper=0.25, estimand = "ATE")
)
matchRes <- MatchBalance(k[,listInput$tr] ~ k[,listInput$y], match.out=mod, nboots=500, data=k, print.level = 0)
#print(listInput$y)
return(list(mod, matchRes))
})
The models can now vary by outcome. Let’s see if we can improve our results.
matchRes <- do.call(rbind, lapply(1:length(m), function(model){
val <- as.data.frame(cbind(
standard.diff=m[[model]][[2]]$AfterMatching[[1]]$sdiff,
var.ratio = m[[model]][[2]]$AfterMatching[[1]]$var.ratio,
sdiff.adj = m[[model]][[2]]$AfterMatching[[1]]$sdiff/100))
return(val)
}))
namesInput <- NULL
for(i in 1:length(psmList)){
namesInput[i] <- psmList[[i]]$y
}
rownames(matchRes) <- namesInput
print(kable(matchRes))
| standard.diff | var.ratio | sdiff.adj | |
|---|---|---|---|
| m3.Ca | -6.4794848 | 0.8941595 | -0.0647948 |
| m3.Mg | -2.5858964 | 0.9904604 | -0.0258590 |
| pH | -5.2477889 | 1.0041375 | -0.0524779 |
| Total.N | -1.1367538 | 1.0242614 | -0.0113675 |
| Total.C | -0.6494876 | 1.0308477 | -0.0064949 |
| ExAl | 4.7152699 | 1.0254513 | 0.0471527 |
| n_season_fert | 79.2241541 | 3.4210382 | 0.7922415 |
| n_season_compost | 8.4743566 | 0.9611436 | 0.0847436 |
| n_seasons_leg_1 | 8.0648655 | 0.8951985 | 0.0806487 |
| n_season_fallow | 5.5545131 | 1.2544335 | 0.0555451 |
| logFert | 103.4470614 | 1.7397363 | 1.0344706 |
| n_seasons_leg_2 | -13.4710688 | 0.9138884 | -0.1347107 |
| n_season_lime | 13.9533769 | 1.5456851 | 0.1395338 |
| fert1.are | 8.3515976 | 2.2500787 | 0.0835160 |
| compost.are | 7.8808248 | 1.0029775 | 0.0788082 |
Interpretation If I divide the standardized mean differences by 100, we meet the balance criteria of the standardized mean difference being close to 0 and the variance being close to 1. Let’s print out the model results to see how Tubura and control farmers compare on key soil meterics. These results should supercede the naive balance tables presented above.
We achieve acceptable balance for the soil attributes but we don’t for seasons of fertilizer use. This is isn’t entirely unexpected given that Tubura’s primary service is providing fertilizer inputs and training.
coefTable <- do.call(rbind, lapply(1:length(m), function(model){
beta = round(m[[model]][[1]]$est.noadj,3)
mean.Tr = round(m[[model]][[2]]$AfterMatching[[1]][[3]], 2)
mean.Co = round(m[[model]][[2]]$AfterMatching[[1]][[4]], 2)
pval = m[[model]][[2]]$AfterMatching[[1]][[10]][[3]] # p.value
#pval = (1 - pnorm(abs(m[[model]][[1]]$est/m[[model]][[1]]$se.standard))) * 2
pval = ifelse(pval < 0.001, "0.001", round(pval, 3))
res = data.frame(beta, mean.Tr, mean.Co, pval)
return(res)
}))
row.names(coefTable) <- namesInput
coefTable$pval.adj <- round(p.adjust(coefTable$pval, method="fdr"),3)
print(kable(coefTable))
| beta | mean.Tr | mean.Co | pval | pval.adj | |
|---|---|---|---|---|---|
| m3.Ca | -40.669 | 796.70 | 837.37 | 0.052 | 0.097 |
| m3.Mg | -3.230 | 208.99 | 212.22 | 0.421 | 0.486 |
| pH | -0.032 | 5.46 | 5.49 | 0.097 | 0.146 |
| Total.N | 0.000 | 0.15 | 0.15 | 0.725 | 0.777 |
| Total.C | -0.003 | 2.08 | 2.08 | 0.841 | 0.841 |
| ExAl | 0.028 | 0.61 | 0.58 | 0.144 | 0.196 |
| n_season_fert | 2.564 | 3.41 | 0.85 | 0.001 | 0.004 |
| n_season_compost | 0.318 | 5.93 | 5.61 | 0.009 | 0.027 |
| n_seasons_leg_1 | 0.214 | 2.29 | 2.07 | 0.014 | 0.035 |
| n_season_fallow | 0.095 | 0.69 | 0.59 | 0.069 | 0.115 |
| logFert | 0.825 | 1.19 | 0.36 | 0.001 | 0.004 |
| n_seasons_leg_2 | -0.427 | 2.61 | 3.04 | 0.001 | 0.004 |
| n_season_lime | 0.097 | 0.23 | 0.13 | 0.001 | 0.004 |
| fert1.are | 0.172 | 1.22 | 1.05 | 0.312 | 0.390 |
| compost.are | 7.938 | 52.36 | 44.43 | 0.018 | 0.039 |
I’m comparing season 1 clients to clients with more tenure in the PSM matching but I also need this variable for the threshold calculations
# 0 is new client, 1 is returning client
d$tenure.tiers <- ifelse(d$client==1 & d$total.seasons==1, 0,
ifelse(d$client==1 & d$total.seasons>1, 1, NA))
thresh <- d %>% group_by(client) %>% dplyr::summarize(
count = n(),
ph = sum(pH<5.8),
carbon = sum(Total.C < 2),
nitrogen = sum(Total.N < 0.1),
calcium = sum(m3.Ca < 1056),
magnesium = sum(m3.Mg < 148)
#aluminum = sum(ExAl)
) %>% mutate(
under.ph = paste(paste(round(ph/count,4)*100, "%", sep=""), " (", ph, ")", sep=""),
under.carbon = paste(paste(round(carbon/count,4)*100,"%", sep=""), " (", carbon, ")", sep=""),
under.nitrogen = paste(paste(round(nitrogen/count,4)*100, "%", sep=""), " (", nitrogen, ")", sep=""),
under.calcium = paste(paste(round(calcium/count,4)*100, "%", sep=""), " (", calcium, ")", sep=""),
under.mag = paste(paste(round(magnesium/count,4)*100,"%", sep=""), " (", magnesium, ")", sep="")
) %>% as.data.frame()
thresh <- thresh[, c("client", names(thresh)[grep("under", names(thresh))])]
thresh <- t(thresh)
colnames(thresh) = thresh[1, ] # the first row will be the header
colnames(thresh) = c("non-client", "client")
thresh = thresh[-1, ]
write.csv(thresh, file=paste("output", "table1_rw_thresholds.csv", sep = "/"), row.names = T)
New clients vs. returning clients summaries:
newOld <- d %>% filter(!is.na(tenure.tiers)) %>% group_by(tenure.tiers) %>%
summarise(
ph = round(mean(pH, na.rm=T),3),
carbon = round(mean(Total.C, na.rm=T),3),
nitrogen = round(mean(Total.N, na.rm=T),3),
calcium = round(mean(m3.Ca, na.rm=T),3),
magnesium = round(mean(m3.Mg, na.rm=T),3)
) %>% ungroup()
newOld <- as.data.frame(t(newOld))
colnames(newOld) = newOld[1, ] # the first row will be the header
colnames(newOld) = c("New Client", "Returning Client")
newOld = newOld[-1, ]
#write.csv(newOld, file=paste(od, "rw table1 new old values.csv", sep = "/"), row.names = T)
New Clients vs. returning clients thresholds
thresh.t <- d %>% filter(!is.na(tenure.tiers)) %>%
group_by(tenure.tiers) %>% dplyr::summarize(
count = n(),
ph = sum(pH<5.8),
carbon = sum(Total.C < 2),
nitrogen = sum(Total.N < 0.1),
calcium = sum(m3.Ca < 1056),
magnesium = sum(m3.Mg < 148)
#aluminum = sum(ExAl)
) %>% mutate(
under.ph = paste(paste(round(ph/count,4)*100, "%", sep=""), " (", ph, ")", sep=""),
under.carbon = paste(paste(round(carbon/count,4)*100,"%", sep=""), " (", carbon, ")", sep=""),
under.nitrogen = paste(paste(round(nitrogen/count,4)*100, "%", sep=""), " (", nitrogen, ")", sep=""),
under.calcium = paste(paste(round(calcium/count,4)*100, "%", sep=""), " (", calcium, ")", sep=""),
under.mag = paste(paste(round(magnesium/count,4)*100,"%", sep=""), " (", magnesium, ")", sep="")
) %>% as.data.frame()
thresh.t <- thresh.t[, c("tenure.tiers", names(thresh.t)[grep("under", names(thresh.t))] )]
thresh.t <- t(thresh.t)
colnames(thresh.t) = thresh.t[1, ] # the first row will be the header
colnames(thresh.t) = c("new client thold", "returning client thold")
thresh.t = thresh.t[-1, ]
# combine newOld summaries and percents below percentiles
thresh.t <- cbind(newOld, thresh.t)
write.csv(thresh.t, file=paste("output", "table1_rw_thresholds_tenure.csv", sep = "/"), row.names = T)
#
write.csv(coefTable, file=paste("output", "psm coefficients.csv", sep = "/"),
row.names = T)
# sort by the order Eric wants
t1order <- c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")
table1vars <- paste(t1order, collapse = "|")
table1 <- coefTable[grep(table1vars, rownames(coefTable)), ]
table1 <- table1[order(match(rownames(table1), t1order)),]
write.csv(table1, file=paste("output", "psm coefficients ordered for ES.csv", sep = "/"))
# 11/17 added lime
t2order <- c("n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow", "n_seasons_leg_1", "n_seasons_leg_2")
table2vars <- paste(t2order, collapse = "|")
table2 <- coefTable[grep(table2vars, rownames(coefTable)), ]
table2 <- table2[order(match(rownames(table2), t2order)),]
write.csv(table2, file=paste("output", "psm coefficients ordered for ES_agprac.csv", sep = "/"))
#table 3
t3order <- c("fert1.are", "compost.are")
table3vars <- paste(t3order, collapse = "|")
table3 <- coefTable[grep(table3vars, rownames(coefTable)), ]
table3 <- table3[order(match(rownames(table3), t3order)),]
write.csv(table3, file=paste("output", "psm coefficients table3.csv", sep = "/"))
Interpretation: Propensity score matching gives us a comparable treatment and control group. The table above shows that after matching on those characteristics, there are effectively no differences between One Acre Fund farmer and Tubura farmers on soil attributes. The unadjusted p-values show 1AF farmers to have slow levels of soil nitrogen but this finding disappears if we account for running multiple matching models.
When we expand the outcome variable set to include practice variables, we first no longer get a good propensity score match for all variables.
This requires us to re-run the PSM matching process, confirm that our models are sound and then output the results ala table 2 again.
psmList <- list(
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="m3.Ca"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="m3.Mg"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="pH"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="Total.N"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="Total.C"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="ExAl"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="n_season_fert"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="n_season_compost"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="n_seasons_leg_1"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="n_season_fallow"),
list(tr = "tenure.tiers",
psmVars = paste(coreVars,
collapse=" + "),
y="logFert"),
list(tr = "tenure.tiers",
psmVars = paste(c(coreVars, "age2",
"hhsize2"),
collapse=" + "),
y="n_seasons_leg_2"),
list(tr = "tenure.tiers",
psmVars = paste(c(coreVars, "age2",
"hhsize2"),
collapse=" + "),
y="n_season_lime"),
list(tr = "tenure.tiers",
psmVars = paste(c("female", "age", "hhsize", "own",
"cows", "goats", "chickens", "pigs", "sheep",
"as.factor(district)", "age2", "hhsize2",
"hhsize_age"),
collapse=" + "),
y="fert1.are"),
# list(tr = "tenure.tiers",
# psmVars = paste(c("female", "age", "hhsize","as.factor(district)",
# "cows", "goats", "chickens", "pigs", "sheep","age2", "hhsize2",
# "hhsize_age"),
# collapse=" + "),
# y="fert2.are"),
list(tr = "tenure.tiers",
psmVars = paste(c("female", "age", "hhsize","as.factor(district)",
"cows", "goats", "chickens", "pigs", "sheep","age2", "hhsize2",
"hhsize_age"),
collapse=" + "),
y="compost.are")
)
# fertilizer application in 15B has missing values at the cell level so i include
# a district level control instead. We don't get a good fit with the current model.
# adjust!
# aggregate(d$fert1.are, by=list(d$client, d$cell), FUN=mean, na.rm=T)
# lime.are is also being finicky
# PSM
set.seed(20161102)
mappend <- lapply(psmList, function(listInput){
naCheck <- unlist(strsplit(gsub("\\+", "", as.vector(listInput$psmVars))," ", fixed=TRUE))
naCheck <- naCheck[-which(naCheck=="")]
# keep complete cases of outcome variable
k <- d[complete.cases(d[,listInput$y]),]
k <- k[complete.cases(k[,listInput$y]),]
k <- k[complete.cases(k[,listInput$tr]),]
# run glm regression:
reg <- glm(as.formula(paste(listInput$tr, "~", listInput$psmVars, sep="")), family= binomial(link="logit"), data=k)
suppressWarnings(
mod <- Match(Y = k[,listInput$y], Tr = k[,listInput$tr], X = reg$fitted, ties=FALSE, replace=FALSE, caliper=0.25, estimand = "ATE")
)
matchRes <- MatchBalance(k[,listInput$tr] ~ k[,listInput$y], match.out=mod, nboots=500, data=k, print.level = 0)
#print(listInput$y)
return(list(mod, matchRes))
})
matchRes <- do.call(rbind, lapply(1:length(mappend), function(model){
val <- as.data.frame(cbind(
standard.diff=m[[model]][[2]]$AfterMatching[[1]]$sdiff,
var.ratio = m[[model]][[2]]$AfterMatching[[1]]$var.ratio,
sdiff.adj = m[[model]][[2]]$AfterMatching[[1]]$sdiff/100))
return(val)
}))
namesInput <- NULL
for(i in 1:length(psmList)){
namesInput[i] <- psmList[[i]]$y
}
rownames(matchRes) <- namesInput
print(kable(matchRes))
| standard.diff | var.ratio | sdiff.adj | |
|---|---|---|---|
| m3.Ca | -6.4794848 | 0.8941595 | -0.0647948 |
| m3.Mg | -2.5858964 | 0.9904604 | -0.0258590 |
| pH | -5.2477889 | 1.0041375 | -0.0524779 |
| Total.N | -1.1367538 | 1.0242614 | -0.0113675 |
| Total.C | -0.6494876 | 1.0308477 | -0.0064949 |
| ExAl | 4.7152699 | 1.0254513 | 0.0471527 |
| n_season_fert | 79.2241541 | 3.4210382 | 0.7922415 |
| n_season_compost | 8.4743566 | 0.9611436 | 0.0847436 |
| n_seasons_leg_1 | 8.0648655 | 0.8951985 | 0.0806487 |
| n_season_fallow | 5.5545131 | 1.2544335 | 0.0555451 |
| logFert | 103.4470614 | 1.7397363 | 1.0344706 |
| n_seasons_leg_2 | -13.4710688 | 0.9138884 | -0.1347107 |
| n_season_lime | 13.9533769 | 1.5456851 | 0.1395338 |
| fert1.are | 8.3515976 | 2.2500787 | 0.0835160 |
| compost.are | 7.8808248 | 1.0029775 | 0.0788082 |
coefTable.append <- do.call(rbind, lapply(1:length(mappend), function(model){
beta = round(mappend[[model]][[1]]$est.noadj,3)
mean.Tr = round(mappend[[model]][[2]]$AfterMatching[[1]][[3]], 2)
mean.Co = round(mappend[[model]][[2]]$AfterMatching[[1]][[4]], 2)
pval = mappend[[model]][[2]]$AfterMatching[[1]][[10]][[3]] # p.value
#pval = (1 - pnorm(abs(m[[model]][[1]]$est/m[[model]][[1]]$se.standard))) * 2
pval = ifelse(pval < 0.001, "0.001", round(pval, 3))
res = data.frame(beta, mean.Tr, mean.Co, pval)
return(res)
}))
row.names(coefTable.append) <- namesInput
coefTable.append$pval.adj <- round(p.adjust(coefTable.append$pval, method="fdr"),3)
names(coefTable.append) <- c("beta", "Old Clients", "New Clients", "pval", "pval.adj")
print(kable(coefTable.append))
| beta | Old Clients | New Clients | pval | pval.adj | |
|---|---|---|---|---|---|
| m3.Ca | -46.924 | 824.09 | 871.02 | 0.425 | 0.708 |
| m3.Mg | -3.113 | 208.98 | 212.09 | 0.79 | 0.846 |
| pH | -0.005 | 5.49 | 5.49 | 0.924 | 0.924 |
| Total.N | -0.007 | 0.15 | 0.16 | 0.021 | 0.105 |
| Total.C | -0.071 | 2.04 | 2.11 | 0.089 | 0.270 |
| ExAl | 0.028 | 0.61 | 0.58 | 0.605 | 0.846 |
| n_season_fert | 1.640 | 3.55 | 1.91 | 0.001 | 0.008 |
| n_season_compost | 0.560 | 5.80 | 5.24 | 0.09 | 0.270 |
| n_seasons_leg_1 | 0.213 | 2.28 | 2.07 | 0.389 | 0.708 |
| n_season_fallow | 0.044 | 0.63 | 0.59 | 0.739 | 0.846 |
| logFert | 0.473 | 1.31 | 0.83 | 0.001 | 0.008 |
| n_seasons_leg_2 | 0.102 | 2.36 | 2.26 | 0.702 | 0.846 |
| n_season_lime | 0.050 | 0.23 | 0.18 | 0.305 | 0.654 |
| fert1.are | 0.153 | 1.35 | 1.20 | 0.715 | 0.846 |
| compost.are | 7.693 | 48.77 | 41.08 | 0.243 | 0.607 |
# sort by the order Eric wants
t1order.a <- c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")
table1vars.a <- paste(t1order.a, collapse = "|")
table1.a <- coefTable.append[grep(table1vars.a, rownames(coefTable.append)), ]
table1.a <- table1.a[order(match(rownames(table1.a), t1order.a)),]
write.csv(table1.a, file=paste("output", "psm coefficients ordered for ES-appendix.csv", sep = "/"))
# 11/17 added lime
t2order.a <- c("n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow", "n_seasons_leg_1", "n_seasons_leg_2")
table2vars.a <- paste(t2order.a, collapse = "|")
table2.a <- coefTable.append[grep(table2vars.a, rownames(coefTable.append)), ]
table2.a <- table2.a[order(match(rownames(table2.a), t2order.a)),]
write.csv(table2.a, file=paste("output", "psm coefficients ordered for ES_agprac-appendix.csv", sep = "/"))
#table 3
t3order.a <- c("fert1.are", "compost.are")
table3vars.a <- paste(t3order.a, collapse = "|")
table3.a <- coefTable.append[grep(table3vars.a, rownames(coefTable.append)), ]
table3.a <- table3.a[order(match(rownames(table3.a), t3order.a)),]
write.csv(table3.a, file=paste("output", "psm coefficients table3-appendix.csv", sep = "/"))
See here for some guidance on hwo to use weights to reconstruct the group balance following the matches.
See here for weighted t.test documentation
suppressMessages(library(weights))
tableVars <- c("age", "female", "hhsize", "own")
postMatch <- do.call(rbind, lapply(1:length(m), function(model){
innerPost <- do.call(rbind, lapply(tableVars, function(x){
mean.t = weighted.mean(d[m[[model]][[1]]$index.treated,][,x], m[[model]][[1]]$weights)
mean.c = weighted.mean(d[m[[model]][[1]]$index.control,][,x], m[[model]][[1]]$weights)
# combined data
dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,],
d[m[[model]][[1]]$index.control,]))
test = wtd.t.test(d[m[[model]][[1]]$index.treated,][,x],
d[m[[model]][[1]]$index.control,][,x],
weight=m[[model]][[1]]$weights,
samedata=TRUE)
return(data.frame(model.num = model,
outcome=x,
tr=mean.t,
contr = mean.c,
pval = test$coefficients[3][[1]]))
}))
return(innerPost)
}))
write.csv(postMatch, file=paste("output", "rw post match covars.csv", sep = "/"),
row.names = F)
Per Robert’s suggestion, now that we’ve matched Tubura and non-Tubura farmers, let’s assess the severity of Tubura tenure on key soil health outcomes.
tenureTab.add <- lapply(1:length(m), function(model){
dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,],
d[m[[model]][[1]]$index.control,]))
dm$client_tenure <- dm$client*dm$total.seasons
mod <- lm(as.formula(paste("dm[,psmList[[model]]$y] ~", "total.seasons + as.factor(cell)", sep ="")), data=dm)
return(mod)
})
# add tenured>=3 to d if we want to run this as a binary comparison
# tenureTab.binary <- lapply(1:length(m), function(model){
#
# dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,],
# d[m[[model]][[1]]$index.control,]))
# dm$client_tenure <- dm$client*dm$total.seasons
# mod <- lm(as.formula(paste("dm[,psmList[[model]]$y] ~", "tenured + as.factor(cell)", sep ="")), data=dm)
# return(mod)
# })
# modNames <- c("Calcium", "Magnesium", "pH", "Nitrogen", "Carbon", "Seasons fertilizer", "Seasons compost", "Seasons legumes", "Seasons fallow", "Fertilizer (log)", "Season Sec. legumes", "Seasons Lime", "Fertilizer/Are", "Compost/are")
modNames <- unlist(lapply(psmList, function(x){
return(x$y)
}))
suppressWarnings(
stargazer(tenureTab.add, type="html",
title = "2016A Rwanda Soil Health Baseline - PSM Tenure",
covariate.labels = "One Acre Fund Tenure",
dep.var.labels = "",
#column.labels = modNames,
#column.labels = c(gsub("m3.", "", as.vector(namesInput))),
notes = "Includes FE for cell",
omit=c("cell"), out=paste(od, "rw_baseline_matched_tenure.htm", sep="/"))
)
plm.tenure <- function(x){
intercept = x$coefficients[[1]]
beta = round(x$coefficients[[2]],3)
int.pval = summary(x)$coefficients[1,4]
int.pval = ifelse(int.pval < 0.001, "< 0.001", round(as.numeric(int.pval),3))
beta.pval = summary(x)$coefficients[2,4]
beta.pval = ifelse(beta.pval < 0.001, "< 0.001", round(as.numeric(beta.pval),3))
res = data.frame(intercept, int.pval, beta, beta.pval, stringsAsFactors = F)
return(res)
}
tenure.reg <- do.call(rbind, lapply(tenureTab.add, function(x){
plm.tenure(x)
}))
rownames(tenure.reg) <- modNames
t6order <- c("cows", "pigs", "sheep", "goats", "chickens")
table6vars <- paste(t6order, collapse = "|")
rw.table6 <- output[grep(table6vars, rownames(output)), ]
rw.table6 <- rw.table6[order(match(rownames(rw.table6), t4order)),]
ESorder <- c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl", "n_season_fert",
"logFert", "n_season_compost", "n_seasons_leg_1", "n_season_fallow")
ESorder.grep <- paste(ESorder, collapse = "|")
tenure.reg <- tenure.reg[grep(ESorder.grep,rownames(tenure.reg)),]
tenure.reg <- tenure.reg[order(match(row.names(tenure.reg), ESorder)),]
write.csv(tenure.reg, file=paste(od, "table13_reg.csv", sep = "/"), row.names=T)
Interpretation: Using a PSM matched sample, the models above assess the effects of additional years of farming with Tubura. Numerous control farmers have also been Tubura farmers in previous seasons. Thus, I’m keeping the model simple instead of adding a client*tenure interaction. We can easily test that as well though.
I draw from Robert’s example in the BGMS soil response notebook.
Scale soil variables to remove units as an issue.
scaledSoil <- scale(d[,soilVars])
clustMethods <- c("ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid")
invisible(sapply(clustMethods, FUN=function(method) {
plot(hclust(dist(scaledSoil), method=method), main=method, ylab="", label=FALSE)
}))
Note: I’m honestly not certain what we’re looking for in these outputs. Look for a balanced tree? Ward looks pretty balanced.
hc <- hclust(dist(scaledSoil), method="ward.D2")
d$hc4 <- cutree(hc, k=4)
hcCentroids <- aggregate(list(d[,soilVars]), by=list(cluster=d$hc4), FUN=mean)
hcTable <- rbind(t(table(d$hc4)),
t(round(hcCentroids, 2)))
print(kable(hcTable))
| 1 | 2 | 3 | 4 | |
|---|---|---|---|---|
| 726.00 | 806.00 | 503.00 | 404.00 | |
| cluster | 1.00 | 2.00 | 3.00 | 4.00 |
| m3.Ca | 709.60 | 329.10 | 910.03 | 1893.42 |
| m3.Mg | 210.68 | 111.16 | 236.64 | 371.69 |
| pH | 5.63 | 4.83 | 5.63 | 6.33 |
| Total.N | 0.12 | 0.16 | 0.17 | 0.17 |
| Total.C | 1.65 | 2.32 | 2.20 | 2.23 |
| ExAl | 0.24 | 1.30 | 0.33 | 0.12 |
K-means clustering is a typical unsupervised learning algorithm. We start by identifying the number of clusters we should tell the formula to look for.
numClusters <- 2:10
plot(numClusters,
sapply(numClusters, FUN=function(k) { sum(kmeans(scaledSoil, centers=k, nstart=20, iter.max=20)$withinss) }),
type="b", ylab="average within cluster sum of squared error")
We’re looking for the bend in the graph. It seems to come at 3 but we can try both 3 and 4 and see how they match with our understanding of AEZ and environmental conditions.
set.seed(20161125)
clusters <- 4
d$km4 <- kmeans(scaledSoil, centers=clusters, nstart=20, iter.max=20)$cluster
kmCentroids <- aggregate(list(d[,soilVars]), by=list(cluster=d$km4), FUN=mean)
kmTable <- rbind(t(table(d$km4)),
t(round(kmCentroids, 2)))
print(kable(kmTable))
| 1 | 2 | 3 | 4 | |
|---|---|---|---|---|
| 462.00 | 669.00 | 972.00 | 336.00 | |
| cluster | 1.00 | 2.00 | 3.00 | 4.00 |
| m3.Ca | 1761.59 | 385.48 | 846.72 | 322.50 |
| m3.Mg | 373.34 | 130.16 | 224.14 | 102.13 |
| pH | 6.17 | 4.97 | 5.75 | 4.76 |
| Total.N | 0.19 | 0.14 | 0.13 | 0.19 |
| Total.C | 2.38 | 2.01 | 1.75 | 2.76 |
| ExAl | 0.19 | 0.94 | 0.20 | 1.56 |
These clusters are roughly equally sized. Robert used another method, partitioning around Mediods, to better account for outliers that were causing the algorithm to create a small additional cluster.
Give the clusters brief descriptions
d$km4 <- factor(d$km4, labels=c("pH=5.9, C=1.7",
"pH=6.1,C=2.4",
"pH=5.1, C=2",
"pH=4.8, C=2.6"))
suppressMessages(library(ggmap))
qmplot(lon, lat, data=d, color=d$km4)
Using zoom = 9...
Source : http://tile.stamen.com/toner-lite/9/297/258.png
Source : http://tile.stamen.com/toner-lite/9/298/258.png
Source : http://tile.stamen.com/toner-lite/9/299/258.png
Source : http://tile.stamen.com/toner-lite/9/297/259.png
Source : http://tile.stamen.com/toner-lite/9/298/259.png
Source : http://tile.stamen.com/toner-lite/9/299/259.png
Source : http://tile.stamen.com/toner-lite/9/297/260.png
Source : http://tile.stamen.com/toner-lite/9/298/260.png
Source : http://tile.stamen.com/toner-lite/9/299/260.png
#qmplot(lon, lat, data=d, color=as.factor(hc4))
crdref <- CRS('+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0')
e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e, proj4string = crdref)
pal <- colorFactor("RdYlBu", domain=unique(ss$km4))
leaflet() %>% addTiles() %>%
setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
addCircleMarkers(lng=ss$lon, lat=ss$lat,
#radius= ifelse(ss$km4==1, 10,6),
color = pal(ss$km4), clusterOptions = markerClusterOptions(disableClusteringAtZoom=11, spiderfyOnMaxZoom=FALSE)) %>%
addLegend(pal = pal, values=unique(ss$km4), title = "Clusters ")
NA
ggplot(d, aes(x=km4, y=alt)) + geom_boxplot()
Try this when we have aWhere data
print(kable(table(d$km4, d$aez)))
| Central Plateau | Congo Nile | East | Lake Kivu | |
|---|---|---|---|---|
| pH=5.9, C=1.7 | 65 | 11 | 291 | 95 |
| pH=6.1,C=2.4 | 154 | 268 | 57 | 190 |
| pH=5.1, C=2 | 345 | 82 | 253 | 292 |
| pH=4.8, C=2.6 | 30 | 235 | 13 | 58 |
We’re not seeing close alignment between clusters and AEZ. This is probably because AEZ are a mix of altitude, rainfall, temperature and soil. When we have weather data we can try to cluster again and see if we’re more closely aligned.
Alternatively, this could be initial evidence that the AEZ designations are not capturing and separating variation very well. I think we need to start with more data though.
#crdref <- CRS('+proj=longlat +datum=WGS84')
crdref <- CRS('+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0')
e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e, proj4string = crdref)
rw <- getData("GADM", country='RW', level=3, path = "/Users/mlowes/drive/optimized_agronomy/soil/soil_health_study/data") # the higher the number, the higher the resolution
#ext.rw.ss matches points with spatial polygons in rw
ext.rw.ss <- extract(rw[, "NAME_3"], ss)
Loading required namespace: rgeos
ss$spatialname <- ext.rw.ss$NAME_3
frw <- fortify(rw, region="NAME_3")
#for soil features
ss.soil <- aggregate(ss@data[,soilVars], by=list(ss@data$spatialname), function(x){
mean(x, na.rm=T)
})
plotReady <- dplyr::left_join(frw, ss.soil, by=c("id"="Group.1"))
ss.count <- ss@data %>% group_by(spatialname) %>% dplyr::summarize(
count = n()
) %>% as.data.frame()
countPlot <- dplyr::left_join(frw, ss.count, by=c("id"="spatialname"))
Generate district density summaries for Nathaniel to pair with the map
districtDensity <- d %>% group_by(district) %>% dplyr::summarize(
count = n()
) %>% as.data.frame()
cellDensity <- d %>% group_by(district, cell) %>% dplyr::summarize(
count = n()
) %>% as.data.frame()
write.csv(districtDensity, file=paste("output", "rw district density table.csv", sep = "/"),
row.names=T)
write.csv(cellDensity, file=paste("output", "rw cell density table.csv", sep = "/"),
row.names=T)
library(RColorBrewer)
mapRes <- ggplot(countPlot, aes(x=long, y=lat, group=group)) + geom_path() +
geom_polygon(aes(fill=countPlot$count)) +
#coord_map(xlim = c(33.5, 36),ylim = c(-2, 1.75)) +
scale_fill_gradientn(colours = brewer.pal(9,"Reds"), # define colors
name = "Sampling Density",
guide = guide_colorbar(legend.direction = "vertical")) +
theme_bw() +
labs(title="Rwanda soil health study sampling density", x = "Longitude", y="Latitude")
print(mapRes)
pdf(paste(od, "rwanda sampling density.pdf", sep = "/"), width=11, height=8.5)
print(mapRes)
dev.off()
quartz_off_screen
2
Consider revising these maps to a smaller greographic unit. Add the name of the location for uninitiated users.
library(RColorBrewer)
mapList <- list()
for(i in 1:length(soilVars)){
mapRes <- ggplot(plotReady, aes(x=long, y=lat, group=group)) + geom_path() +
geom_polygon(aes(fill=plotReady[,soilVars[i]])) +
scale_fill_gradientn(colours = rev(brewer.pal(9,"Reds")), # define colors
name = soilVars[i],
guide = guide_colorbar(legend.direction = "vertical")) +
theme_bw() +
labs(title=paste("Rwanda long term soil health baseline - 2016", soilVars[i], sep= " "), x = "Longitude", y="Latitude")
mapList[[i]] <- mapRes
print(mapRes)
}
# This is a small experiment to combine raster (spdf) and leaflet and be able to access the data in the raster interactively.
#mapLayer <- sp::merge(rw, ss.soil, by.x="NAME_3", by.y="Group.1")
# fill = T, fillOpacity = 0.7, fillColor = d.fill,
# stroke = T, color = "white", weight = 2, dashArray = 3,
# opacity = 0.5, popup = county.tt(d)
# leaflet(mapLayer) %>% addTiles() %>%
# setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
# addPolygons(
# stroke = TRUE, opacity=0.2, smoothFactor = 0.5,
# fillColor=mapLayer$pH, fillOpacity = 0.5)
pdf(file=paste(md, "rw_shs_baseline_soil_maps.pdf", sep = "/"), width=11, height=8.5)
Error in paste(md, "rw_shs_baseline_soil_maps.pdf", sep = "/") :
object 'md' not found
Interpolate soil health values for full operating area using soil health study values. We want to eventually add all Rwandan soil values into a single dataset to update and hone these values. See here for more guidanace
note:
The code below will run 5 K-fold cross validation to compare interpolation models. The output will be fed into the interpolate leaflet code below.
Check that I’m handling the projection correctly with Robert
Now loop over the variables of interest where x is the soilVar variable.
Print out the interpolated values for inclusion in the report.
pdf(file=paste(md, "rw_shs_bl_interpolation_soil_vars.pdf", sep = '/'), width=11, height=8.5)
lapply(soilVars, function(x) {
m <- Tps(coordinates(ss), ss@data[,x])
# make raster layer with model, raster is rwanda empty raster, model is m
tps <- interpolate(r, m)
tps <- crop(tps, rw)
tps <- raster::mask(tps, rw) # cuts the tps raster down to the rw boundaries
x <- gsub("m3.", "", x)
suppressMessages(
outPlot <- invisible(print(
plot(tps, main= paste("Soil TPS Interpolation ", x, sep="- ")),
plot(rw, add=T, na.print=NULL)
))
)
})
NULL
NULL
NULL
NULL
NULL
NULL
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
dev.off()
null device
1
–end